Data preprocessing

Uploading, merging, and editing source datasets

info_healthy <- read_xlsx ("data/raw/final_health_statistic.xlsx") #metadata of healthy subjects 
info_ibs <- read_xlsx ("data/raw/final_ibs_141_statistic.xlsx") ##metadata of IBS subjects

combined_info dataset

#Combining `info_healthy` and `info_ibs` into a single dataset `combined_info` with subsequent content editing
combined_info <- info_healthy %>% 
  bind_rows(info_ibs) %>%
  mutate (
    BMI_min = ifelse (is.na (BMI_min), round (Weight_min /(Height_max/100 * Height_max/100), 2), BMI_min),
    BMI_max = ifelse (is.na (BMI_max), round (Weight_max /(Height_min/100 * Height_min/100), 2), BMI_max)
    ) %>% 
  unite("BMI_range", BMI_min, BMI_max, sep = "-", na.rm = TRUE) %>%
  unite ("Age_range", Age_min, Age_max, sep = "-", na.rm = TRUE) %>%
  mutate(
    Age_range = case_when(
      Age_range == "18-40" | Age_range == "23-28" | Age_range == "16-42" | Age_range == "21-43" ~ "16-43",
      Age <= 43 ~ "16-43",
      Age > 43 ~ "> 43",
      TRUE ~ NA_character_), #group 28-54 has been deleted
    research_ID = sub ("research_", "", research_ID),
    research_ID = case_when(
      research_ID == 0 ~ 1,
      research_ID == 1 ~ 2,
      research_ID == 2 ~ 3,
      research_ID == 3 ~ 4,
      research_ID == 4 ~ 5, 
      research_ID == 6 ~ 6, 
      research_ID == 7 ~ 7), 
    patient_ID = row_number(),
    Sex = ifelse (Sex == "mixed", NA, Sex),
    Smoking = sub ("never", "Never",  Smoking),
    Smoking = case_when(
      Smoking == "No" ~ "No",
      Smoking == "Never" ~ "No",
      Smoking == "Rarely (a few times/month)" ~ "Yes", #5 people
      Smoking == "Occasionally (1-2 times/week)" ~ "Yes",  #3 people
      Smoking == "Regularly (3-5 times/week)" ~ "Yes",  #1 people
      Smoking == "Daily" ~ "Yes"), #7 чел.
    Alcohol = sub ("rarely", "Rarely", Alcohol),
    Alcohol = ifelse(Alcohol == "Regularly (3-5 times/week)"|
                       Alcohol == "Daily", #3 чел.
                     "Regularly (3-7 times/week)", Alcohol),
    Antibiotics_usage = case_when(
      Antibiotics_usage == "Month" | Antibiotics_usage == ~ "3 months" |
        Antibiotics_usage == "6 months" ~ "1-6 months",
        #2 IBS people - Month, 0 healthy people - 3 months, 0 healthy people - 6 months
      Antibiotics_usage == "Year" | Antibiotics_usage == "Not use"  ~ 
        "12 months/Not use"), # 0 healthy people - 6-12 months, zero IBS people - Not use
    Hygiene = case_when(
      Hygiene == "Occasionally (1-2 times/week) cosmetics" ~ "Occasionally cosmetics (1-2 times/week)",
      Hygiene == "Rarely (a few times/month) cosmetics" ~ "Rarely cosmetics (a few times/month)",
      TRUE ~ Hygiene),
    Hygiene = case_when(
      Hygiene == "Daily cosmetics"|Hygiene == "Regularly cosmetics (3-5 times/week)" ~ "Regularly (3-7 times/week)", #0 чел. больных  для 3-5 times/week
      Hygiene == "Occasionally cosmetics (1-2 times/week)" | Hygiene == "Rarely cosmetics (a few times/month)" ~ "Occasionally (a few-8 times/month)",
      #только 2 чел. больных для 1-2 times/week
      Hygiene == "Never cosmetics" ~ "Never"),
    Physical_activity = sub ("regularly", "Regularly",  Smoking),
    BMI = ifelse (is.na(Weight_kg), BMI, Weight_kg/ (Height_cm/100 * Height_cm/100)),
    BMI_range = ifelse(BMI_range == "", NA, BMI_range),
    BMI_category = case_when(
      BMI_range == "18-25" ~ "normal/overweight",
      BMI_range == "19.21-29.29" ~ "normal/overweight",
      BMI_range == "20.6-29.6" ~ "normal/overweight",
      BMI_range == "21.74-28.38" ~ "normal/overweight",
      BMI_range == "18.5-30.8" ~ "normal/overweight", #a little over 30
      BMI < 18.5 ~ "underweight",
      BMI >= 18.5 & BMI < 30 ~ "normal/overweight",
      BMI >= 30 ~ "obese")
    ) %>% 
  rename ("Cosmetics" = Hygiene ) %>% 
  
  mutate_if (is.character, as.factor) %>% 
  
  select(-c(
    Instrument, # unique (combined_info$Instrument) = "Illumina MiSeq" 
    Isolation_source, # unique (combined_info$Isolation_source) = "faeces" 
    Assay_type, # unique (combined_info$Assay_type) = "AMPLICON"
    Target_gene, # unique (combined_info$Target_gene) = "16S"
    Main_Disease, # unique (combined_info$Main_Disease) = NA (for healthy), 141 (for IBS)
    Drugs, # unique (combined_info$Drugs) = NA
    Social_status, # unique (combined_info$Social_status) =  NA, urban 
    Weight_kg, Height_cm, Weight_min, Weight_max, Height_min, Height_max, BMI_range, #have been used to create the variable "BMI_category" and to reduce the amount of NA in "BMI"
    BMI, # NA for all healthy people
    Birth_Year, # no additional information
    Pets_type # unique (combined_info$Pets_type = cat, NA
  ))

rm (info_healthy, info_ibs)

#install.packages("summarytools")
library(summarytools)

saved_x11_option <- st_options("use.x11")
st_options(use.x11 = FALSE)

combined_info %>% 
  select(- patient_ID) %>% 
  dfSummary() %>% 
  print (method = "render")

Data Frame Summary

combined_info

Dimensions: 381 x 21
Duplicates: 244
No Variable Stats / Values Freqs (% of Valid) Valid Missing
1 research_ID [numeric]
Mean (sd) : 4.2 (2)
min ≤ med ≤ max:
1 ≤ 4 ≤ 7
IQR (CV) : 3 (0.5)
1:46(12.1%)
2:36(9.4%)
3:70(18.4%)
4:87(22.8%)
5:25(6.6%)
6:22(5.8%)
7:95(24.9%)
381 (100.0%) 0 (0.0%)
2 Seq_region [factor]
1. V3-V4
2. V4
3. V5-V6
106(27.8%)
229(60.1%)
46(12.1%)
381 (100.0%) 0 (0.0%)
3 Seq_date [numeric]
Mean (sd) : 2018.8 (2.9)
min ≤ med ≤ max:
2015 ≤ 2018 ≤ 2023
IQR (CV) : 4 (0)
2015:95(24.9%)
2017:25(6.6%)
2018:96(25.2%)
2020:39(10.2%)
2021:32(8.4%)
2022:23(6.0%)
2023:71(18.6%)
381 (100.0%) 0 (0.0%)
4 Health_state [factor]
1. Disease
2. Health
170(44.6%)
211(55.4%)
381 (100.0%) 0 (0.0%)
5 Age [numeric]
Mean (sd) : 38.7 (13.8)
min ≤ med ≤ max:
19 ≤ 34 ≤ 76
IQR (CV) : 19.8 (0.4)
45 distinct values 134 (35.2%) 247 (64.8%)
6 Age_range [factor]
1. > 43
2. 16-43
44(15.4%)
242(84.6%)
286 (75.1%) 95 (24.9%)
7 Sex [factor]
1. female
2. male
79(41.4%)
112(58.6%)
191 (50.1%) 190 (49.9%)
8 Country [factor]
1. Australia
2. Austria
3. Belgium
4. Germany
5. Greece
6. Hungary
7. Ireland
8. Israel
9. Italy
10. Norway
[ 4 others ]
6(1.6%)
28(7.3%)
46(12.1%)
45(11.8%)
1(0.3%)
2(0.5%)
1(0.3%)
23(6.0%)
37(9.7%)
1(0.3%)
191(50.1%)
381 (100.0%) 0 (0.0%)
9 Race [factor]
1. African American
2. Asian or Pacific Islander
3. Caucasian
4. Hispanic
5. Other
1(0.8%)
1(0.8%)
116(89.2%)
1(0.8%)
11(8.5%)
130 (34.1%) 251 (65.9%)
10 Smoking [factor]
1. No
2. Yes
187(92.1%)
16(7.9%)
203 (53.3%) 178 (46.7%)
11 Alcohol [factor]
1. Never
2. Occasionally (1-2 times/w
3. Rarely (a few times/month
4. Regularly (3-7 times/week
12(13.8%)
24(27.6%)
36(41.4%)
15(17.2%)
87 (22.8%) 294 (77.2%)
12 Antibiotics_usage [factor]
1. 1-6 months
2. 12 months/Not use
48(27.1%)
129(72.9%)
177 (46.5%) 204 (53.5%)
13 Physical_activity [factor]
1. No
2. Yes
187(92.1%)
16(7.9%)
203 (53.3%) 178 (46.7%)
14 Travel_period [factor]
1. 3 months
2. 6 months
3. Month
4. Year
18(20.9%)
10(11.6%)
13(15.1%)
45(52.3%)
86 (22.6%) 295 (77.4%)
15 Education_level [factor]
1. Bachelor's level
2. Master's level
3. School Level
31(36.9%)
28(33.3%)
25(29.8%)
84 (22.0%) 297 (78.0%)
16 Cosmetics [factor]
1. Never
2. Occasionally (a few-8 tim
3. Regularly (3-7 times/week
44(51.2%)
17(19.8%)
25(29.1%)
86 (22.6%) 295 (77.4%)
17 Sleep_duration [factor]
1. > 8 hours
2. 4-6 hours
3. 6-7 hours
4. 7-8 hours
9(10.3%)
9(10.3%)
31(35.6%)
38(43.7%)
87 (22.8%) 294 (77.2%)
18 Diet_type [factor]
1. 10.0
2. 5.0
13(72.2%)
5(27.8%)
18 (4.7%) 363 (95.3%)
19 Diet_duration [factor]
1. 3.0
2. 6.0
8(61.5%)
5(38.5%)
13 (3.4%) 368 (96.6%)
20 Additive_usage [factor]
1. 22.0
2. 23.0
3. 4.0
1(9.1%)
1(9.1%)
9(81.8%)
11 (2.9%) 370 (97.1%)
21 BMI_category [factor]
1. normal/overweight
2. obese
3. underweight
287(98.0%)
5(1.7%)
1(0.3%)
293 (76.9%) 88 (23.1%)

Generated by summarytools 1.0.1 (R version 4.3.2)
2024-02-11

combined_info %>% 
  select (research_ID, Country, Seq_date, Seq_region, Health_state) %>% 
  mutate(Country = if_else (research_ID == 4, "Australia, Austria, Germany, Greece, Hungary, Ireland, Israel, Italy, Norway, UK, USA", Country),
         Seq_date = ifelse (research_ID == 4, "2018-2023", Seq_date),
         Health_state = if_else (research_ID == 4, "Disease/Health", Health_state)) %>% 
  group_by(research_ID, Country, Seq_date, Seq_region, Health_state) %>% 
  summarise(n = n()) %>% 
  flextable() %>% theme_box() %>% 
  merge_v("Health_state") %>% 
  align(align = "center", part = "all") %>% 
  set_caption("General characteristics of the included studies")
General characteristics of the included studies

research_ID

Country

Seq_date

Seq_region

Health_state

n

1

Belgium

2018

V5-V6

Health

46

2

Italy

2018

V3-V4

36

3

Poland

2023

V3-V4

70

4

Australia, Austria, Germany, Greece, Hungary, Ireland, Israel, Italy, Norway, UK, USA

2018-2023

V4

Disease/Health

87

5

Austria

2017

V4

Disease

25

6

Israel

2022

V4

22

7

Spain

2015

V4

95

tbl_summary of combined_info

library(gtsummary)

combined_info %>% 
  select(! c(patient_ID,
              Diet_duration, Additive_usage, Diet_type #for all healthy subjects = NA
              )) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Health_state) %>%
  add_p()
Characteristic Disease, N = 1701 Health, N = 2111 p-value2
research_ID

<0.001
    1 0 (0%) 46 (22%)
    2 0 (0%) 36 (17%)
    3 0 (0%) 70 (33%)
    4 28 (16%) 59 (28%)
    5 25 (15%) 0 (0%)
    6 22 (13%) 0 (0%)
    7 95 (56%) 0 (0%)
Seq_region

<0.001
    V3-V4 0 (0%) 106 (50%)
    V4 170 (100%) 59 (28%)
    V5-V6 0 (0%) 46 (22%)
Seq_date

<0.001
    2015 95 (56%) 0 (0%)
    2017 25 (15%) 0 (0%)
    2018 14 (8%) 82 (39%)
    2020 3 (2%) 36 (17%)
    2021 9 (5%) 23 (11%)
    2022 23 (14%) 0 (0%)
    2023 1 (1%) 70 (33%)
Age 42 (32, 50) 28 (26, 37) <0.001
    Unknown 95 152
Age_range

<0.001
    > 43 34 (45%) 10 (5%)
    16-43 41 (55%) 201 (95%)
    Unknown 95 0
Sex

0.15
    female 35 (48%) 44 (37%)
    male 38 (52%) 74 (63%)
    Unknown 97 93
Country


    Australia 0 (0%) 6 (3%)
    Austria 25 (15%) 3 (1%)
    Belgium 0 (0%) 46 (22%)
    Germany 0 (0%) 45 (21%)
    Greece 0 (0%) 1 (0%)
    Hungary 0 (0%) 2 (1%)
    Ireland 1 (1%) 0 (0%)
    Israel 22 (13%) 1 (0%)
    Italy 0 (0%) 37 (18%)
    Norway 1 (1%) 0 (0%)
    Poland 0 (0%) 70 (33%)
    Spain 95 (56%) 0 (0%)
    UK 12 (7%) 0 (0%)
    USA 14 (8%) 0 (0%)
Race

0.4
    African American 0 (0%) 1 (1%)
    Asian or Pacific Islander 0 (0%) 1 (1%)
    Caucasian 27 (100%) 89 (86%)
    Hispanic 0 (0%) 1 (1%)
    Other 0 (0%) 11 (11%)
    Unknown 143 108
Smoking 4 (14%) 12 (7%) 0.2
    Unknown 142 36
Alcohol

0.002
    Never 4 (14%) 8 (14%)
    Occasionally (1-2 times/week) 4 (14%) 20 (34%)
    Rarely (a few times/month) 9 (32%) 27 (46%)
    Regularly (3-7 times/week) 11 (39%) 4 (7%)
    Unknown 142 152
Antibiotics_usage

0.020
    1-6 months 2 (8%) 46 (30%)
    12 months/Not use 23 (92%) 106 (70%)
    Unknown 145 59
Physical_activity 4 (14%) 12 (7%) 0.2
    Unknown 142 36
Travel_period

0.066
    3 months 3 (11%) 15 (26%)
    6 months 1 (4%) 9 (16%)
    Month 4 (14%) 9 (16%)
    Year 20 (71%) 25 (43%)
    Unknown 142 153
Education_level

<0.001
    Bachelor's level 5 (18%) 26 (46%)
    Master's level 18 (64%) 10 (18%)
    School Level 5 (18%) 20 (36%)
    Unknown 142 155
Cosmetics

0.6
    Never 15 (56%) 29 (49%)
    Occasionally (a few-8 times/month) 6 (22%) 11 (19%)
    Regularly (3-7 times/week) 6 (22%) 19 (32%)
    Unknown 143 152
Sleep_duration

0.2
    > 8 hours 4 (14%) 5 (8%)
    4-6 hours 4 (14%) 5 (8%)
    6-7 hours 12 (43%) 19 (32%)
    7-8 hours 8 (29%) 30 (51%)
    Unknown 142 152
BMI_category

0.012
    normal/overweight 135 (96%) 152 (100%)
    obese 5 (4%) 0 (0%)
    underweight 1 (1%) 0 (0%)
    Unknown 29 59
1 n (%); Median (IQR)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test

###combined_bacteria and data_wide (not batched) datasets

bacteria_healthy <- read_csv("data/raw/final_bacteria_health.csv") #abundance table for healthy
bacteria_ibs <- read_csv("data/raw/final_bacteria_ibs_141.csv") #abundance table for healthy

combined_bacteria <- bacteria_healthy %>% 
  bind_rows (bacteria_ibs) %>% 
  mutate(patient_ID = row_number())

rm (bacteria_healthy, bacteria_ibs)

# Combining `combined_info` and `combined_bacteria` into `data_wide`
data_wide <- combined_info %>% left_join (combined_bacteria)

Estimation of the NA-proportion and zero values

data_wide %>% 
  select (where (function(x) sum (is.na(x))/ nrow(data_wide) * 100 > 0)) %>% 
  sapply (function(x) sum (is.na(x))/ nrow(data_wide) * 100) %>% round(1) %>% 
  as.data.frame() %>% 
  rename(NA_percentage = ".") %>% 
  mutate (
    "Number of people with known data" = round (nrow(data_wide) - NA_percentage/100 * nrow(data_wide)),
    NA_percentage = paste (NA_percentage, "%", sep = " ")
    ) %>% 
  arrange(desc (NA_percentage)) %>% 
  rownames_to_column() %>% 
  as_tibble() %>% flextable()

rowname

NA_percentage

Number of people with known data

Additive_usage

97.1 %

11

Diet_duration

96.6 %

13

Diet_type

95.3 %

18

Education_level

78 %

84

Travel_period

77.4 %

86

Cosmetics

77.4 %

86

Alcohol

77.2 %

87

Sleep_duration

77.2 %

87

Race

65.9 %

130

Age

64.8 %

134

Antibiotics_usage

53.5 %

177

Sex

49.9 %

191

Smoking

46.7 %

203

Physical_activity

46.7 %

203

Age_range

24.9 %

286

BMI_category

23.1 %

293

  • Estimation of the NA-proportion and zero values in variables of data_wide
# Устанавливаем порог процента 
threshold_percent <- 95 
 
# Функция для вычисления процента записей, не равных NA и не равных 0, для каждой колонки 
calculate_percentage <- function(col) { 
  sum(!is.na(col) & col != 0) / length(col) * 100 
} 
 
# Применяем функцию к каждой колонке в датасете 
percentage_non_zero_non_na <- sapply(data_wide[, -1], calculate_percentage) 
 
# Создаем датафрейм с результатами 
result_df_sort <- data.frame( 
  column = names(percentage_non_zero_non_na), 
  percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% 
  arrange(desc(percentage))
 
# Отфильтровываем колонки, у которых процент записей менее threshold_percent% 
filtered_columns <- result_df_sort[result_df_sort$percentage < threshold_percent, ]

# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_columns, 
           file = "data/originals/percentage_by_vars.xlsx")

# Перезапись data_wide с выбором колонок с процентом NA/0 менее threshold_percent 
data_wide <- data_wide %>% 
  select (row.names(filtered_columns), research_ID)

rm (calculate_percentage, result_df_sort)
  • Estimation of the NA-proportion and zero values in cases of data_wide
# Устанавливаем порог процента 
threshold_percent <- 95
 
# Рассчитываем процент значений, не являющихся NA и не равных 0, для каждого пациента 
percentage_non_zero_non_na <- rowMeans(!is.na(data_wide) & data_wide != 0, na.rm = TRUE) * 100 
 
# Создаем датафрейм с результатами 
result_df_sort <- data.frame( 
  patient_id = data_wide$patient_ID, 
  percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% arrange(desc(percentage))

# Отфильтровываем пациентов, у которых процент значений менее threshold_percent% 
filtered_patients <- result_df_sort[result_df_sort$percentage < threshold_percent, ] 

# Сохраним датасет в excel для дальнейшего анализа 
write.xlsx(filtered_patients,
           file = "data/originals/percentage_by_patient.xlsx") 

# Перезапись data_wide с удалением строк с процентом NA/0 более threshold_percent 
data_wide <- data_wide %>% 
  slice (filtered_patients$patient_id) #при threshold_percent = 95%, изменения data_wide не происходит, так как нет пациентов с процентом NA/0 более 95%

rm (percentage_non_zero_non_na, result_df_sort)
# Deleting columns and rows with a NA/0-percentage more than threshold_percent from `data_wide`
data_wide <- data_wide %>% 
  select (patient_ID, any_of (colnames(combined_info)), everything()) %>% 
  arrange(patient_ID)

Search for “single study taxa”

  • G-taxa
G_only_one <- data_wide %>% 
  select (research_ID, ends_with("_G")) %>%
  add_column(n = 1) %>% #for further counting the subjects number in each study
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #selecting "single study taxa"
  mutate (across(-c(1:2), function(x) x/n)) %>% 
  mutate (across(-c(1:2), function(x) round (x,3))) 

G_only_one %>% as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study G-taxa")
Mean percentage of single study G-taxa

research_ID

n

CM1G08_G

Cladosporium_G

Lentimonas_G

Micromonospora_G

Pseudosphingobacterium_G

Schizothrix LEGE 07164_G

Talaromyces_G

Rs-D38 termite group_G

Kabatiella_G

Anaerolineaceae UCG-001_G

Iamia_G

Thiohalocapsa_G

Ellin517_G

1

46

0.000

0.000

0.05

0.033

0.017

0.011

0.000

0.000

0.000

0.07

0.03

0.274

0.000

2

36

0.000

0.008

0.00

0.000

0.000

0.000

0.014

0.000

0.022

0.00

0.00

0.000

0.000

3

70

0.000

0.000

0.00

0.000

0.000

0.000

0.000

0.005

0.000

0.00

0.00

0.000

0.000

4

87

0.006

0.000

0.00

0.000

0.000

0.000

0.000

0.000

0.000

0.00

0.00

0.000

0.000

5

25

0.000

0.000

0.00

0.000

0.000

0.000

0.000

0.000

0.000

0.00

0.00

0.000

0.000

6

22

0.000

0.000

0.00

0.000

0.000

0.000

0.000

0.000

0.000

0.00

0.00

0.000

0.000

7

95

0.000

0.000

0.00

0.000

0.000

0.000

0.000

0.000

0.000

0.00

0.00

0.000

1.594

#Determining the percentage of people in whom these taxa have not been detected
G_taxon <- 'Ellin517_G'
res_ID <- 7

#data_wide %>% 
  #select (research_ID, G_taxon) %>%
  #filter (research_ID == res_ID) %>% 
  #summarise(across (-1,
              #function (x) sum (x==0)/nrow (.) * 100)) %>% 
  #rename ("Taxon_zero_percentage, %" = G_taxon) 

# CM1G08_G - 73,6% нулей
# Cladosporium_G - 94,0% нулей
# Lentimonas_G - 94,0% нулей
# Micromonospora_G - 93,2% нулей
# Pseudosphingobacterium_G - 93,2% нулей 
# Schizothrix LEGE 07164_G - 92,7% нулей 
# Talaromyces_G - 92,7% нулей 
# Rs-D38 termite group_G - 91,9% нулей 
# Kabatiella_G - 90,6% нулей 
# Anaerolineaceae UCG-001_G - 89,5% нулей 
# Iamia_G - 88,2% нулей 
# Thiohalocapsa_G - 87,9% нулей 
# Ellin517_G - 75,1% нулей 
  • F-taxa
data_wide %>% 
  select (research_ID, ends_with("_F")) %>%
  add_column(n = 1) %>% # for further counting the subjects number in each study
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #selecting "single study taxa"
  mutate (across(-c(1:2), function(x) x/n)) %>% 
  mutate (across(-c(1:2), function(x) round (x,3))) %>% 
  as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study F-taxa")
Mean percentage of single study F-taxa

research_ID

n

Didymellaceae_F

Cladosporiaceae_F

Trichocomaceae_F

type III_F

09D2Z48_F

Aureobasidiaceae_F

Iamiaceae_F

1

46

0.000

0.000

0.000

0.011

0.012

0.000

0.03

2

36

0.021

0.008

0.014

0.000

0.000

0.022

0.00

3

70

0.000

0.000

0.000

0.000

0.000

0.000

0.00

4

87

0.000

0.000

0.000

0.000

0.000

0.000

0.00

5

25

0.000

0.000

0.000

0.000

0.000

0.000

0.00

6

22

0.000

0.000

0.000

0.000

0.000

0.000

0.00

7

95

0.000

0.000

0.000

0.000

0.000

0.000

0.00

#Determining the percentage of people in whom these taxa have not been detected
#F_taxon <- 'Iamiaceae_F'
#res_ID <- 1

#data_wide %>% 
  #select (research_ID, F_taxon) %>%
  #filter (research_ID == res_ID) %>% 
  #summarise(across (-1,
              #function (x) sum (x==0)/nrow (.) * 100)) %>% 
  #rename ("Taxon_zero_percentage, %" = F_taxon) 

# Didymellaceae_F - 44,4% нулей
# Cladosporiaceae_F - 36,1% нулей
# Trichocomaceae_F - 22,2% нулей
# type III_F - 30,4% нулей
# 09D2Z48_F - 23,9% нулей
# Aureobasidiaceae_F - 0% нулей 
# Iamiaceae_F - 2,2% нулей
  • O-taxa
data_wide %>% 
  select (research_ID, ends_with("_O")) %>%
  add_column(n = 1) %>% #for further counting the subjects number in each study
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #selecting "single study taxa"
  mutate (across(-c(1:2), function(x) x/n)) %>% 
  mutate (across(-c(1:2), function(x) round (x,3))) %>% 
  as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study O-taxa")
Mean percentage of single study O-taxa

research_ID

n

Hypocreales_O

Pleosporales_O

Candidatus Abawacabacteria_O

Candidatus Peregrinibacteria_O

Capnodiales_O

Candidatus Terrybacteria_O

Eurotiales_O

Dothideales_O

eub62A3_O

LD1-PA32_O

1

46

0.000

0.000

0.009

0.008

0.000

0.008

0.000

0.000

0.013

0.051

2

36

0.062

0.037

0.000

0.000

0.012

0.000

0.014

0.027

0.000

0.000

3

70

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

4

87

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

5

25

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

6

22

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

7

95

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

0.000

#Determining the percentage of people in whom these taxa have not been detected
#O_taxon <- 'Candidatus Abawacabacteria_O'
#res_ID <- 1

#data_wide %>% 
  #select (research_ID, O_taxon) %>%
  #filter (research_ID == res_ID) %>% 
  #summarise(across (-1, function (x) sum (x==0)/nrow (.) * 100)) %>% 
  #rename ("Taxon_zero_percentage, %" = O_taxon) 

# Hypocreales_O - 41,7% нулей
# Pleosporales_O - 41,7% нулей
# Candidatus Abawacabacteria_O - 52,2% нулей
# Candidatus Peregrinibacteria_O - 52,2% нулей
# Capnodiales_O - 33,3% нулей
# Candidatus Terrybacteria_O - 39,1% нулей
# Eurotiales_O - 39,1% нулей
# Dothideales_O - 0% нулей
# eub62A3_O - 15,2% нулей
# LD1-PA32_O - 2,2% нулей
  • C-taxa
data_wide %>% 
  select (research_ID, ends_with("_C")) %>%
  add_column(n = 1) %>% #for further counting the subjects number in each study
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #selecting "single study taxa"
  mutate (across(-c(1:2), function(x) x/n)) %>% 
   mutate (across(-c(1:2), function(x) round (x,3))) %>% 
  as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study C-taxa")
Mean percentage of single study C-taxa

research_ID

n

WWE3_C

Eurotiomycetes_C

Dothideomycetes_C

1

46

0.000

0.000

0.000

2

36

0.000

0.023

0.102

3

70

0.000

0.000

0.000

4

87

0.014

0.000

0.000

5

25

0.000

0.000

0.000

6

22

0.000

0.000

0.000

7

95

0.000

0.000

0.000

#Determining the percentage of people in whom these taxa have not been detected
#С_taxon <- 'Dothideomycetes_C'
#res_ID <- 2

#data_wide %>% 
  #select (research_ID, С_taxon) %>%
  #filter (research_ID == res_ID) %>% 
  #summarise(across (-1, function (x) sum (x==0)/nrow (.) * 100)) %>% 
  #rename ("Taxon_zero_percentage, %" = С_taxon) 

# WWE3_C - 74,4% нулей
# Eurotiomycetes_C - 19,4% нулей 
# Dothideomycetes_C - 0% нулей 
  • P-taxa

“Single study P-taxa” have not been detected

   

data_wide_not_batched dataset

data_wide <- data_wide %>% 
  select (!colnames (G_only_one) [- c (1,2)]) #deleting of single study G-taxa (in this single study the taxa were present in no more than 25% of subjects)

data_wide_not_batched <- data_wide  

write_rds(data_wide, 
          file = "data/originals/data_wide_not_batched.rds")

Clustering before batch effect correction

AGNES (euclidean distance matrix)

clusters_number <- 7 #number of clusters

library(cluster)
library(factoextra)

G_scaled <- data_wide %>% 
  mutate(patient_ID = paste0("patient_", patient_ID)) %>% 
  column_to_rownames("patient_ID") %>% 
  select (ends_with("_G")) %>% 
  scale () 

agnes <- G_scaled %>% 
  dist (method = "euclidean") %>% 
  as.matrix() %>%  
  agnes( #Agglomerative clustering
    diss = TRUE, #dissimilarity matrix
    method = "ward")

fviz_dend(agnes,
          k = clusters_number,
          rect = TRUE,
          k_colors = "jco",
          show_labels = TRUE,
          label_cols = ifelse (data_wide[agnes$order,]$Health_state == "Health", "green", "red"),
          #labels colors = Health_state
          cex = 0.2,
          main = "Cluster dengrogram based on G-taxa abundance before batch effect correction",
          ylab = "")

  • Comparison of the clusters
data_wide %>% 
  add_column("Cluster" = cutree(agnes, k = clusters_number)) %>% 
  select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Cluster) %>% 
  add_p()
Characteristic 1, N = 461 2, N = 311 3, N = 1891 4, N = 231 5, N = 651 6, N = 21 7, N = 251 p-value2
research_ID







    1 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    2 0 (0%) 31 (100%) 4 (2%) 1 (4%) 0 (0%) 0 (0%) 0 (0%)
    3 0 (0%) 0 (0%) 2 (1%) 3 (13%) 65 (100%) 0 (0%) 0 (0%)
    4 0 (0%) 0 (0%) 66 (35%) 19 (83%) 0 (0%) 2 (100%) 0 (0%)
    5 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 25 (100%)
    6 0 (0%) 0 (0%) 22 (12%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    7 0 (0%) 0 (0%) 95 (50%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
Seq_region







    V3-V4 0 (0%) 31 (100%) 6 (3%) 4 (17%) 65 (100%) 0 (0%) 0 (0%)
    V4 0 (0%) 0 (0%) 183 (97%) 19 (83%) 0 (0%) 2 (100%) 25 (100%)
    V5-V6 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
Seq_date







    2015 0 (0%) 0 (0%) 95 (50%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    2017 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 25 (100%)
    2018 46 (100%) 31 (100%) 11 (6%) 7 (30%) 0 (0%) 1 (50%) 0 (0%)
    2020 0 (0%) 0 (0%) 36 (19%) 2 (9%) 0 (0%) 1 (50%) 0 (0%)
    2021 0 (0%) 0 (0%) 21 (11%) 11 (48%) 0 (0%) 0 (0%) 0 (0%)
    2022 0 (0%) 0 (0%) 23 (12%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    2023 0 (0%) 0 (0%) 3 (2%) 3 (13%) 65 (100%) 0 (0%) 0 (0%)
Health_state







    Disease 0 (0%) 0 (0%) 134 (71%) 10 (43%) 0 (0%) 1 (50%) 25 (100%)
    Health 46 (100%) 31 (100%) 55 (29%) 13 (57%) 65 (100%) 1 (50%) 0 (0%)
Age NA (NA, NA) NA (NA, NA) 32 (27, 42) 49 (43, 52) NA (NA, NA) 32 (32, 33) 39 (28, 49) 0.002
    Unknown 46 31 101 4 65 0 0
Age_range







    > 43 0 (0%) 0 (0%) 21 (22%) 13 (57%) 0 (0%) 0 (0%) 10 (40%)
    16-43 46 (100%) 31 (100%) 73 (78%) 10 (43%) 65 (100%) 2 (100%) 15 (60%)
    Unknown 0 0 95 0 0 0 0
Sex






<0.001
    female 0 (NA%) 0 (0%) 20 (42%) 5 (24%) 38 (58%) 0 (0%) 16 (64%)
    male 0 (NA%) 31 (100%) 28 (58%) 16 (76%) 27 (42%) 1 (100%) 9 (36%)
    Unknown 46 0 141 2 0 1 0
Country







    Australia 0 (0%) 0 (0%) 2 (1%) 4 (17%) 0 (0%) 0 (0%) 0 (0%)
    Austria 0 (0%) 0 (0%) 0 (0%) 2 (9%) 0 (0%) 1 (50%) 25 (100%)
    Belgium 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Germany 0 (0%) 0 (0%) 45 (24%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Greece 0 (0%) 0 (0%) 1 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Hungary 0 (0%) 0 (0%) 0 (0%) 2 (9%) 0 (0%) 0 (0%) 0 (0%)
    Ireland 0 (0%) 0 (0%) 0 (0%) 1 (4%) 0 (0%) 0 (0%) 0 (0%)
    Israel 0 (0%) 0 (0%) 23 (12%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Italy 0 (0%) 31 (100%) 4 (2%) 2 (9%) 0 (0%) 0 (0%) 0 (0%)
    Norway 0 (0%) 0 (0%) 1 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Poland 0 (0%) 0 (0%) 2 (1%) 3 (13%) 65 (100%) 0 (0%) 0 (0%)
    Spain 0 (0%) 0 (0%) 95 (50%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    UK 0 (0%) 0 (0%) 8 (4%) 3 (13%) 0 (0%) 1 (50%) 0 (0%)
    USA 0 (0%) 0 (0%) 8 (4%) 6 (26%) 0 (0%) 0 (0%) 0 (0%)
Race






0.016
    African American 0 (0%) 0 (NA%) 1 (2%) 0 (0%) 0 (NA%) 0 (0%) 0 (NA%)
    Asian or Pacific Islander 0 (0%) 0 (NA%) 1 (2%) 0 (0%) 0 (NA%) 0 (0%) 0 (NA%)
    Caucasian 46 (100%) 0 (NA%) 49 (78%) 19 (100%) 0 (NA%) 2 (100%) 0 (NA%)
    Hispanic 0 (0%) 0 (NA%) 1 (2%) 0 (0%) 0 (NA%) 0 (0%) 0 (NA%)
    Other 0 (0%) 0 (NA%) 11 (17%) 0 (0%) 0 (NA%) 0 (0%) 0 (NA%)
    Unknown 0 31 126 4 65 0 25
Smoking 0 (0%) 0 (NA%) 14 (21%) 2 (9%) 0 (0%) 0 (0%) 0 (NA%) <0.001
    Unknown 0 31 121 1 0 0 25
Alcohol






0.8
    Never 0 (NA%) 0 (NA%) 8 (12%) 4 (21%) 0 (NA%) 0 (0%) 0 (NA%)
    Occasionally (1-2 times/week) 0 (NA%) 0 (NA%) 20 (30%) 4 (21%) 0 (NA%) 0 (0%) 0 (NA%)
    Rarely (a few times/month) 0 (NA%) 0 (NA%) 26 (39%) 8 (42%) 0 (NA%) 2 (100%) 0 (NA%)
    Regularly (3-7 times/week) 0 (NA%) 0 (NA%) 12 (18%) 3 (16%) 0 (NA%) 0 (0%) 0 (NA%)
    Unknown 46 31 123 4 65 0 25
Antibiotics_usage






<0.001
    1-6 months 46 (100%) 0 (0%) 1 (5%) 1 (8%) 0 (0%) 0 (0%) 0 (NA%)
    12 months/Not use 0 (0%) 31 (100%) 21 (95%) 11 (92%) 65 (100%) 1 (100%) 0 (NA%)
    Unknown 0 0 167 11 0 1 25
Physical_activity 0 (0%) 0 (NA%) 14 (21%) 2 (9%) 0 (0%) 0 (0%) 0 (NA%) <0.001
    Unknown 0 31 121 1 0 0 25
Travel_period






0.019
    3 months 0 (NA%) 0 (NA%) 12 (18%) 6 (33%) 0 (NA%) 0 (0%) 0 (NA%)
    6 months 0 (NA%) 0 (NA%) 10 (15%) 0 (0%) 0 (NA%) 0 (0%) 0 (NA%)
    Month 0 (NA%) 0 (NA%) 7 (11%) 4 (22%) 0 (NA%) 2 (100%) 0 (NA%)
    Year 0 (NA%) 0 (NA%) 37 (56%) 8 (44%) 0 (NA%) 0 (0%) 0 (NA%)
    Unknown 46 31 123 5 65 0 25
Education_level






0.006
    Bachelor's level 0 (NA%) 0 (NA%) 26 (41%) 5 (28%) 0 (NA%) 0 (0%) 0 (NA%)
    Master's level 0 (NA%) 0 (NA%) 15 (23%) 11 (61%) 0 (NA%) 2 (100%) 0 (NA%)
    School Level 0 (NA%) 0 (NA%) 23 (36%) 2 (11%) 0 (NA%) 0 (0%) 0 (NA%)
    Unknown 46 31 125 5 65 0 25
Cosmetics






0.2
    Never 0 (NA%) 0 (NA%) 30 (46%) 13 (68%) 0 (NA%) 1 (50%) 0 (NA%)
    Occasionally (a few-8 times/month) 0 (NA%) 0 (NA%) 16 (25%) 1 (5%) 0 (NA%) 0 (0%) 0 (NA%)
    Regularly (3-7 times/week) 0 (NA%) 0 (NA%) 19 (29%) 5 (26%) 0 (NA%) 1 (50%) 0 (NA%)
    Unknown 46 31 124 4 65 0 25
Sleep_duration






0.2
    > 8 hours 0 (NA%) 0 (NA%) 6 (9%) 3 (16%) 0 (NA%) 0 (0%) 0 (NA%)
    4-6 hours 0 (NA%) 0 (NA%) 5 (8%) 4 (21%) 0 (NA%) 0 (0%) 0 (NA%)
    6-7 hours 0 (NA%) 0 (NA%) 22 (33%) 8 (42%) 0 (NA%) 1 (50%) 0 (NA%)
    7-8 hours 0 (NA%) 0 (NA%) 33 (50%) 4 (21%) 0 (NA%) 1 (50%) 0 (NA%)
    Unknown 46 31 123 4 65 0 25
BMI_category






0.10
    normal/overweight 46 (100%) 31 (100%) 133 (97%) 11 (85%) 65 (100%) 1 (100%) 0 (NA%)
    obese 0 (0%) 0 (0%) 3 (2%) 2 (15%) 0 (0%) 0 (0%) 0 (NA%)
    underweight 0 (0%) 0 (0%) 1 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (NA%)
    Unknown 0 0 52 10 0 1 25
1 n (%); Median (IQR)
2 Kruskal-Wallis rank sum test; Fisher’s exact test

AGNES (distance matrix for Bray-Curtis dissimilarity)

agnes2 <- data_wide %>% 
  column_to_rownames("patient_ID") %>% 
  select (ends_with("_G")) %>% 
  vegdist(method = "bray") %>% #distance matrix for Bray-Curtis dissimilarity:
  #the Bray–Curtis dissimilarity is bounded between 0 and 1, where 0 means the two sites have the same composition , and 1 means the two sites do not share any species
  as.matrix() %>% 
  agnes( #Agglomerative clustering 
    diss = TRUE, #dissimilarity matrix
    method = "ward")

fviz_dend(agnes2, 
          cex = 0.2, k = clusters_number,
          rect = TRUE,
          k_colors = "jco",
          color_labels_by_k = FALSE,
          label_cols = ifelse (data_wide[agnes2$order,]$Health_state == "Health", "green", "red"),
          #labels colors = Health_state
          main = "Cluster dengrogram based on G-taxa abundance before batch effect correction",
          ylab = "")

  • Comparison of the clusters
fisher.test.simulate.p.values <- function(data, variable, by, ...) {
  result <- list()
  test_results <- stats::fisher.test(data[[variable]], data[[by]], simulate.p.value = TRUE)
  result$p <- test_results$p.value
  result$test <- test_results$method
  result
}

data_wide %>% 
  add_column("Cluster" = cutree(agnes2, k = clusters_number)) %>%
  select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Cluster) %>% 
  add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values"))
Characteristic 1, N = 461 2, N = 701 3, N = 1211 4, N = 411 5, N = 461 6, N = 211 7, N = 361 p-value2
research_ID






<0.001
    1 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    2 0 (0%) 19 (27%) 5 (4%) 8 (20%) 3 (7%) 1 (5%) 0 (0%)
    3 0 (0%) 25 (36%) 26 (21%) 8 (20%) 9 (20%) 0 (0%) 2 (6%)
    4 0 (0%) 6 (9%) 36 (30%) 0 (0%) 9 (20%) 20 (95%) 16 (44%)
    5 0 (0%) 2 (3%) 10 (8%) 0 (0%) 0 (0%) 0 (0%) 13 (36%)
    6 0 (0%) 4 (6%) 9 (7%) 3 (7%) 4 (9%) 0 (0%) 2 (6%)
    7 0 (0%) 14 (20%) 35 (29%) 22 (54%) 21 (46%) 0 (0%) 3 (8%)
Seq_region






<0.001
    V3-V4 0 (0%) 44 (63%) 31 (26%) 16 (39%) 12 (26%) 1 (5%) 2 (6%)
    V4 0 (0%) 26 (37%) 90 (74%) 25 (61%) 34 (74%) 20 (95%) 34 (94%)
    V5-V6 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
Seq_date






<0.001
    2015 0 (0%) 14 (20%) 35 (29%) 22 (54%) 21 (46%) 0 (0%) 3 (8%)
    2017 0 (0%) 2 (3%) 10 (8%) 0 (0%) 0 (0%) 0 (0%) 13 (36%)
    2018 46 (100%) 21 (30%) 8 (7%) 8 (20%) 5 (11%) 7 (33%) 1 (3%)
    2020 0 (0%) 2 (3%) 20 (17%) 0 (0%) 5 (11%) 2 (10%) 10 (28%)
    2021 0 (0%) 2 (3%) 11 (9%) 0 (0%) 2 (4%) 12 (57%) 5 (14%)
    2022 0 (0%) 4 (6%) 10 (8%) 3 (7%) 4 (9%) 0 (0%) 2 (6%)
    2023 0 (0%) 25 (36%) 27 (22%) 8 (20%) 9 (20%) 0 (0%) 2 (6%)
Health_state






<0.001
    Disease 0 (0%) 23 (33%) 63 (52%) 25 (61%) 29 (63%) 10 (48%) 20 (56%)
    Health 46 (100%) 47 (67%) 58 (48%) 16 (39%) 17 (37%) 11 (52%) 16 (44%)
Age NA (NA, NA) 36 (29, 47) 33 (27, 48) 35 (30, 41) 32 (29, 38) 48 (42, 56) 31 (26, 41) 0.010
    Unknown 46 58 66 38 33 1 5
Age_range






<0.001
    > 43 0 (0%) 5 (9%) 15 (17%) 1 (5%) 3 (12%) 13 (62%) 7 (21%)
    16-43 46 (100%) 51 (91%) 71 (83%) 18 (95%) 22 (88%) 8 (38%) 26 (79%)
    Unknown 0 14 35 22 21 0 3
Sex






0.027
    female 0 (NA%) 18 (34%) 36 (59%) 4 (21%) 7 (37%) 6 (32%) 8 (40%)
    male 0 (NA%) 35 (66%) 25 (41%) 15 (79%) 12 (63%) 13 (68%) 12 (60%)
    Unknown 46 17 60 22 27 2 16
Country






<0.001
    Australia 0 (0%) 0 (0%) 1 (1%) 0 (0%) 0 (0%) 4 (19%) 1 (3%)
    Austria 0 (0%) 2 (3%) 10 (8%) 0 (0%) 0 (0%) 2 (10%) 14 (39%)
    Belgium 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Germany 0 (0%) 3 (4%) 25 (21%) 0 (0%) 5 (11%) 0 (0%) 12 (33%)
    Greece 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (5%) 0 (0%)
    Hungary 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2 (10%) 0 (0%)
    Ireland 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (5%) 0 (0%)
    Israel 0 (0%) 4 (6%) 10 (8%) 3 (7%) 4 (9%) 0 (0%) 2 (6%)
    Italy 0 (0%) 19 (27%) 5 (4%) 8 (20%) 3 (7%) 2 (10%) 0 (0%)
    Norway 0 (0%) 0 (0%) 1 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Poland 0 (0%) 25 (36%) 26 (21%) 8 (20%) 9 (20%) 0 (0%) 2 (6%)
    Spain 0 (0%) 14 (20%) 35 (29%) 22 (54%) 21 (46%) 0 (0%) 3 (8%)
    UK 0 (0%) 2 (3%) 1 (1%) 0 (0%) 4 (9%) 4 (19%) 1 (3%)
    USA 0 (0%) 1 (1%) 7 (6%) 0 (0%) 0 (0%) 5 (24%) 1 (3%)
Race






<0.001
    African American 0 (0%) 0 (0%) 1 (3%) 0 (NA%) 0 (0%) 0 (0%) 0 (0%)
    Asian or Pacific Islander 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 1 (11%) 0 (0%) 0 (0%)
    Caucasian 46 (100%) 6 (100%) 28 (80%) 0 (NA%) 7 (78%) 20 (100%) 9 (64%)
    Hispanic 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (0%) 0 (0%) 1 (7%)
    Other 0 (0%) 0 (0%) 6 (17%) 0 (NA%) 1 (11%) 0 (0%) 4 (29%)
    Unknown 0 64 86 41 37 1 22
Smoking 0 (0%) 2 (6%) 7 (11%) 0 (0%) 1 (6%) 3 (15%) 3 (17%) 0.080
    Unknown 0 39 59 33 28 1 18
Alcohol






0.7
    Never 0 (NA%) 0 (0%) 5 (14%) 0 (NA%) 1 (11%) 5 (25%) 1 (6%)
    Occasionally (1-2 times/week) 0 (NA%) 1 (17%) 12 (33%) 0 (NA%) 1 (11%) 4 (20%) 6 (38%)
    Rarely (a few times/month) 0 (NA%) 4 (67%) 12 (33%) 0 (NA%) 4 (44%) 8 (40%) 8 (50%)
    Regularly (3-7 times/week) 0 (NA%) 1 (17%) 7 (19%) 0 (NA%) 3 (33%) 3 (15%) 1 (6%)
    Unknown 46 64 85 41 37 1 20
Antibiotics_usage






<0.001
    1-6 months 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (11%) 1 (25%)
    12 months/Not use 0 (0%) 47 (100%) 39 (100%) 16 (100%) 16 (100%) 8 (89%) 3 (75%)
    Unknown 0 23 82 25 30 12 32
Physical_activity 0 (0%) 2 (6%) 7 (11%) 0 (0%) 1 (6%) 3 (15%) 3 (17%) 0.089
    Unknown 0 39 59 33 28 1 18
Travel_period






0.051
    3 months 0 (NA%) 0 (0%) 10 (28%) 0 (NA%) 1 (11%) 5 (26%) 2 (13%)
    6 months 0 (NA%) 0 (0%) 8 (22%) 0 (NA%) 1 (11%) 0 (0%) 1 (6%)
    Month 0 (NA%) 0 (0%) 2 (6%) 0 (NA%) 1 (11%) 4 (21%) 6 (38%)
    Year 0 (NA%) 6 (100%) 16 (44%) 0 (NA%) 6 (67%) 10 (53%) 7 (44%)
    Unknown 46 64 85 41 37 2 20
Education_level






0.15
    Bachelor's level 0 (NA%) 1 (20%) 11 (31%) 0 (NA%) 3 (33%) 6 (32%) 10 (63%)
    Master's level 0 (NA%) 1 (20%) 10 (29%) 0 (NA%) 4 (44%) 10 (53%) 3 (19%)
    School Level 0 (NA%) 3 (60%) 14 (40%) 0 (NA%) 2 (22%) 3 (16%) 3 (19%)
    Unknown 46 65 86 41 37 2 20
Cosmetics






0.8
    Never 0 (NA%) 3 (50%) 18 (50%) 0 (NA%) 4 (50%) 12 (60%) 7 (44%)
    Occasionally (a few-8 times/month) 0 (NA%) 2 (33%) 6 (17%) 0 (NA%) 3 (38%) 2 (10%) 4 (25%)
    Regularly (3-7 times/week) 0 (NA%) 1 (17%) 12 (33%) 0 (NA%) 1 (13%) 6 (30%) 5 (31%)
    Unknown 46 64 85 41 38 1 20
Sleep_duration






0.2
    > 8 hours 0 (NA%) 2 (33%) 2 (6%) 0 (NA%) 1 (11%) 3 (15%) 1 (6%)
    4-6 hours 0 (NA%) 0 (0%) 2 (6%) 0 (NA%) 1 (11%) 4 (20%) 2 (13%)
    6-7 hours 0 (NA%) 2 (33%) 12 (33%) 0 (NA%) 1 (11%) 9 (45%) 7 (44%)
    7-8 hours 0 (NA%) 2 (33%) 20 (56%) 0 (NA%) 6 (67%) 4 (20%) 6 (38%)
    Unknown 46 64 85 41 37 1 20
BMI_category






0.032
    normal/overweight 46 (100%) 64 (98%) 80 (98%) 41 (100%) 39 (98%) 8 (80%) 9 (100%)
    obese 0 (0%) 1 (2%) 2 (2%) 0 (0%) 0 (0%) 2 (20%) 0 (0%)
    underweight 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (3%) 0 (0%) 0 (0%)
    Unknown 0 5 39 0 6 11 27
1 n (%); Median (IQR)
2 Fisher’s Exact Test for Count Data with simulated p-value (based on 2000 replicates); Kruskal-Wallis rank sum test

kMeans

kmeans <- G_scaled %>% 
  kmeans(centers = clusters_number,
         iter.max = 20, nstart = 35) 

  fviz_cluster(kmeans,data = G_scaled,
               ellipse = FALSE,
               show.clust.cent = FALSE,
               geom = "point",
               main = "kMeans clustering based on G-taxa abundance before batch effect correction")

Batch-effects correction

the MBECS package was used, a step-by-step actions description is available at the link https://github.com/rmolbrich/MBECS

Preliminary report is available in /data/originals/.

  • Correction of research_ID batch-effect
#Run corrections
mbec.obj <- mbecRunCorrections(
  mbec.obj, 
  model.vars=c('research_ID', 'Health_state'),
#model.vars=c (batch effect, presumed biological effect of interest) 
method = "bat", #batch effect correcting algorithm:
#Batch Mean Centering (bmc) /ComBat (bat)/Remove Batch Effect (rbe)
type = "otu") #which abundance matrix to use
## Found 459 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
#Post report
#mbecReportPost(
  #input.obj=mbec.obj, 
  #model.vars=c('research_ID', 'Health_state'), #model.vars=c (batch effect, presumed biological effect of interest) 
#type="otu", # which abundance matrix to use for evaluation: clr/otu/tss
#file.name = "Batch_(research_id)_Correction_Report_otu_bat",
#file.dir = "data/originals/",
#return.data = FALSE)

library (datawizard)

#Retrieve corrrected data
ps.cor.bat <- mbecGetPhyloseq(mbec.obj, 
                          type="cor", #which type of data to add
                          label="bat") #specifies the name within the list

combined_bacteria_batched <- ps.cor.bat@otu_table %>% 
  as.data.frame() %>% 
  data_rotate (rownames = "patient_ID") %>% 
  mutate (patient_ID = sub ("S", "", patient_ID),
          patient_ID = as.numeric(patient_ID))

data_wide_batched <- data_wide %>% 
  select (any_of(colnames (combined_info))) %>% 
  left_join(combined_bacteria_batched)

rm (combined_bacteria_batched)

data_wide <- data_wide_batched  

write_rds(data_wide, 
          file = "data/originals/data_wide.rds")

####Principal Component Analysis

  • before the batch effect correction
plot <- mbecPCA(input.obj=mbec.obj, 
        model.vars=c('research_ID', 'Health_state'),
        type="otu", pca.axes=c(1,2), return.data = FALSE) 

ggsave("data/pictures/PCA_befor_batching.jpeg", plot = plot)
  • after the batch effect correction
plot <- mbecPCA(input.obj=mbec.obj, 
        model.vars=c('research_ID', 'Health_state'),
        type="cor", label = "bat",
        pca.axes=c(1,2), return.data = FALSE)

ggsave("data/pictures/PCA_after_batching.jpeg", plot = plot)

####Heatmap of the top 10 most variable taxa

  • before the batch effect correction
mbecHeat(input.obj=mbec.obj, method = "TOP", n = 10, 
         model.vars=c('research_ID', 'Health_state'),
         center = TRUE, scale = TRUE, 
         type="otu",
         return.data = FALSE)

  • after the batch effect correction
mbecHeat(input.obj=mbec.obj, method = "TOP", n = 10, 
         model.vars=c('research_ID', 'Health_state'),
         center = TRUE, scale = TRUE, 
         type="cor", label = "bat", 
         return.data = FALSE)

AGNES (euclidean distance matrix) after the batch effect correction

G_scaled <- data_wide_batched %>% 
  mutate(patient_ID = paste0("patient_", patient_ID)) %>% 
  column_to_rownames("patient_ID") %>% 
  select (ends_with("_G")) %>% 
  scale () 

agnes <- G_scaled %>% 
  dist (method = "euclidean") %>%
  as.matrix() %>%  
  agnes( #gglomerative clustering
    diss = TRUE, #dissimilarity matrix
    method = "ward")

fviz_dend(agnes,
          k = clusters_number,
          rect = TRUE,
          k_colors = "jco",
          show_labels = TRUE,
          label_cols = ifelse (data_wide_batched[agnes$order,]$Health_state == "Health", "green", "red"),#labels color = Health_state
          cex = 0.2, 
          main = "Cluster dengrogram based on G-taxa abundance after batch effect correction",
          ylab = "")

  • Comparison of the clusters
data_wide %>% 
  add_column("Cluster" = cutree(agnes, k = clusters_number)) %>%
  select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Cluster) %>% 
  add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values"))
Characteristic 1, N = 461 2, N = 1571 3, N = 961 4, N = 221 5, N = 351 6, N = 21 7, N = 231 p-value2
research_ID






<0.001
    1 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    2 0 (0%) 23 (15%) 12 (13%) 1 (5%) 0 (0%) 0 (0%) 0 (0%)
    3 0 (0%) 13 (8%) 23 (24%) 4 (18%) 30 (86%) 0 (0%) 0 (0%)
    4 0 (0%) 29 (18%) 38 (40%) 16 (73%) 2 (6%) 2 (100%) 0 (0%)
    5 0 (0%) 2 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 23 (100%)
    6 0 (0%) 18 (11%) 3 (3%) 0 (0%) 1 (3%) 0 (0%) 0 (0%)
    7 0 (0%) 72 (46%) 20 (21%) 1 (5%) 2 (6%) 0 (0%) 0 (0%)
Seq_region






<0.001
    V3-V4 0 (0%) 36 (23%) 35 (36%) 5 (23%) 30 (86%) 0 (0%) 0 (0%)
    V4 0 (0%) 121 (77%) 61 (64%) 17 (77%) 5 (14%) 2 (100%) 23 (100%)
    V5-V6 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
Seq_date






<0.001
    2015 0 (0%) 72 (46%) 20 (21%) 1 (5%) 2 (6%) 0 (0%) 0 (0%)
    2017 0 (0%) 2 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 23 (100%)
    2018 46 (100%) 27 (17%) 16 (17%) 6 (27%) 0 (0%) 1 (50%) 0 (0%)
    2020 0 (0%) 11 (7%) 23 (24%) 2 (9%) 2 (6%) 1 (50%) 0 (0%)
    2021 0 (0%) 13 (8%) 10 (10%) 9 (41%) 0 (0%) 0 (0%) 0 (0%)
    2022 0 (0%) 19 (12%) 3 (3%) 0 (0%) 1 (3%) 0 (0%) 0 (0%)
    2023 0 (0%) 13 (8%) 24 (25%) 4 (18%) 30 (86%) 0 (0%) 0 (0%)
Health_state






<0.001
    Disease 0 (0%) 103 (66%) 31 (32%) 9 (41%) 3 (9%) 1 (50%) 23 (100%)
    Health 46 (100%) 54 (34%) 65 (68%) 13 (59%) 32 (91%) 1 (50%) 0 (0%)
Age NA (NA, NA) 35 (29, 47) 29 (26, 40) 48 (42, 51) 25 (25, 26) 32 (32, 33) 36 (28, 49) 0.004
    Unknown 46 108 55 6 32 0 0
Age_range






<0.001
    > 43 0 (0%) 15 (18%) 9 (12%) 11 (52%) 0 (0%) 0 (0%) 9 (39%)
    16-43 46 (100%) 70 (82%) 67 (88%) 10 (48%) 33 (100%) 2 (100%) 14 (61%)
    Unknown 0 72 20 1 2 0 0
Sex






0.030
    female 0 (NA%) 23 (32%) 19 (41%) 5 (26%) 18 (58%) 0 (0%) 14 (61%)
    male 0 (NA%) 48 (68%) 27 (59%) 14 (74%) 13 (42%) 1 (100%) 9 (39%)
    Unknown 46 86 50 3 4 1 0
Country






<0.001
    Australia 0 (0%) 2 (1%) 1 (1%) 3 (14%) 0 (0%) 0 (0%) 0 (0%)
    Austria 0 (0%) 2 (1%) 0 (0%) 2 (9%) 0 (0%) 1 (50%) 23 (100%)
    Belgium 46 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Germany 0 (0%) 14 (9%) 29 (30%) 0 (0%) 2 (6%) 0 (0%) 0 (0%)
    Greece 0 (0%) 1 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Hungary 0 (0%) 0 (0%) 0 (0%) 2 (9%) 0 (0%) 0 (0%) 0 (0%)
    Ireland 0 (0%) 0 (0%) 0 (0%) 1 (5%) 0 (0%) 0 (0%) 0 (0%)
    Israel 0 (0%) 19 (12%) 3 (3%) 0 (0%) 1 (3%) 0 (0%) 0 (0%)
    Italy 0 (0%) 23 (15%) 12 (13%) 2 (9%) 0 (0%) 0 (0%) 0 (0%)
    Norway 0 (0%) 0 (0%) 1 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Poland 0 (0%) 13 (8%) 23 (24%) 4 (18%) 30 (86%) 0 (0%) 0 (0%)
    Spain 0 (0%) 72 (46%) 20 (21%) 1 (5%) 2 (6%) 0 (0%) 0 (0%)
    UK 0 (0%) 5 (3%) 3 (3%) 3 (14%) 0 (0%) 1 (50%) 0 (0%)
    USA 0 (0%) 6 (4%) 4 (4%) 4 (18%) 0 (0%) 0 (0%) 0 (0%)
Race






0.023
    African American 0 (0%) 0 (0%) 1 (3%) 0 (0%) 0 (0%) 0 (0%) 0 (NA%)
    Asian or Pacific Islander 0 (0%) 0 (0%) 1 (3%) 0 (0%) 0 (0%) 0 (0%) 0 (NA%)
    Caucasian 46 (100%) 21 (75%) 29 (81%) 16 (100%) 2 (100%) 2 (100%) 0 (NA%)
    Hispanic 0 (0%) 1 (4%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (NA%)
    Other 0 (0%) 6 (21%) 5 (14%) 0 (0%) 0 (0%) 0 (0%) 0 (NA%)
    Unknown 0 129 60 6 33 0 23
Smoking 0 (0%) 6 (14%) 8 (13%) 1 (5%) 1 (3%) 0 (0%) 0 (NA%) 0.045
    Unknown 0 115 35 2 3 0 23
Alcohol






0.2
    Never 0 (NA%) 7 (24%) 3 (8%) 2 (13%) 0 (0%) 0 (0%) 0 (NA%)
    Occasionally (1-2 times/week) 0 (NA%) 5 (17%) 14 (37%) 4 (25%) 1 (50%) 0 (0%) 0 (NA%)
    Rarely (a few times/month) 0 (NA%) 8 (28%) 17 (45%) 8 (50%) 1 (50%) 2 (100%) 0 (NA%)
    Regularly (3-7 times/week) 0 (NA%) 9 (31%) 4 (11%) 2 (13%) 0 (0%) 0 (0%) 0 (NA%)
    Unknown 46 128 58 6 33 0 23
Antibiotics_usage






<0.001
    1-6 months 46 (100%) 1 (2%) 0 (0%) 1 (9%) 0 (0%) 0 (0%) 0 (NA%)
    12 months/Not use 0 (0%) 45 (98%) 43 (100%) 10 (91%) 30 (100%) 1 (100%) 0 (NA%)
    Unknown 0 111 53 11 5 1 23
Physical_activity 0 (0%) 6 (14%) 8 (13%) 1 (5%) 1 (3%) 0 (0%) 0 (NA%) 0.049
    Unknown 0 115 35 2 3 0 23
Travel_period






0.009
    3 months 0 (NA%) 5 (17%) 8 (21%) 5 (33%) 0 (0%) 0 (0%) 0 (NA%)
    6 months 0 (NA%) 3 (10%) 7 (18%) 0 (0%) 0 (0%) 0 (0%) 0 (NA%)
    Month 0 (NA%) 2 (7%) 3 (8%) 4 (27%) 2 (100%) 2 (100%) 0 (NA%)
    Year 0 (NA%) 19 (66%) 20 (53%) 6 (40%) 0 (0%) 0 (0%) 0 (NA%)
    Unknown 46 128 58 7 33 0 23
Education_level






0.10
    Bachelor's level 0 (NA%) 9 (31%) 15 (42%) 5 (33%) 2 (100%) 0 (0%) 0 (NA%)
    Master's level 0 (NA%) 11 (38%) 7 (19%) 8 (53%) 0 (0%) 2 (100%) 0 (NA%)
    School Level 0 (NA%) 9 (31%) 14 (39%) 2 (13%) 0 (0%) 0 (0%) 0 (NA%)
    Unknown 46 128 60 7 33 0 23
Cosmetics






0.13
    Never 0 (NA%) 12 (43%) 20 (53%) 11 (69%) 0 (0%) 1 (50%) 0 (NA%)
    Occasionally (a few-8 times/month) 0 (NA%) 5 (18%) 11 (29%) 1 (6%) 0 (0%) 0 (0%) 0 (NA%)
    Regularly (3-7 times/week) 0 (NA%) 11 (39%) 7 (18%) 4 (25%) 2 (100%) 1 (50%) 0 (NA%)
    Unknown 46 129 58 6 33 0 23
Sleep_duration






0.093
    > 8 hours 0 (NA%) 2 (7%) 4 (11%) 2 (13%) 1 (50%) 0 (0%) 0 (NA%)
    4-6 hours 0 (NA%) 4 (14%) 1 (3%) 4 (25%) 0 (0%) 0 (0%) 0 (NA%)
    6-7 hours 0 (NA%) 7 (24%) 15 (39%) 7 (44%) 1 (50%) 1 (50%) 0 (NA%)
    7-8 hours 0 (NA%) 16 (55%) 18 (47%) 3 (19%) 0 (0%) 1 (50%) 0 (NA%)
    Unknown 46 128 58 6 33 0 23
BMI_category






0.14
    normal/overweight 46 (100%) 133 (98%) 63 (98%) 11 (85%) 33 (100%) 1 (100%) 0 (NA%)
    obese 0 (0%) 2 (1%) 1 (2%) 2 (15%) 0 (0%) 0 (0%) 0 (NA%)
    underweight 0 (0%) 1 (1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (NA%)
    Unknown 0 21 32 9 2 1 23
1 n (%); Median (IQR)
2 Fisher’s Exact Test for Count Data with simulated p-value (based on 2000 replicates); Kruskal-Wallis rank sum test

kMeans

kmeans <- G_scaled %>% 
  kmeans(centers = clusters_number, 
         iter.max = 20, 
         nstart = 35) 

  fviz_cluster(kmeans,data = G_scaled,
               ellipse = FALSE,
               show.clust.cent = FALSE,
               geom = "point",
               main = "kMeans clustering based on G-taxa abundance after batch effect correction")

AGNES (distance matrix Bray-Curtis dissimilarity)

agnes2 <- data_wide_batched %>% 
  column_to_rownames("patient_ID") %>% 
  select (ends_with("_G")) %>% 
  vegdist(method = "bray") %>%
  as.matrix() %>% 
  agnes( #Agglomerative clustering 
    diss = TRUE, #dissimilarity matrix
    method = "ward")

fviz_dend(agnes2, 
          cex = 0.2, k = clusters_number,
          rect = TRUE,
          k_colors = "jco",
          color_labels_by_k = FALSE,
          label_cols = ifelse (data_wide[agnes2$order,]$Health_state == "Health", "green", "red"),
          #labels colors = Health_state
          main = "Cluster dengrogram based on G-taxa abundance after batch effect correction",
          ylab = "")

  • Comparison of the clusters
data_wide %>% 
  add_column("Cluster" = cutree(agnes2, k = clusters_number)) %>%
  select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Cluster) %>% 
  add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values"))
Characteristic 1, N = 461 2, N = 211 3, N = 681 4, N = 581 5, N = 741 6, N = 951 7, N = 191 p-value2
research_ID






<0.001
    1 7 (15%) 3 (14%) 18 (26%) 11 (19%) 7 (9%) 0 (0%) 0 (0%)
    2 10 (22%) 0 (0%) 8 (12%) 3 (5%) 12 (16%) 2 (2%) 1 (5%)
    3 13 (28%) 2 (10%) 24 (35%) 11 (19%) 13 (18%) 6 (6%) 1 (5%)
    4 15 (33%) 2 (10%) 6 (9%) 9 (16%) 15 (20%) 26 (27%) 14 (74%)
    5 0 (0%) 9 (43%) 0 (0%) 2 (3%) 2 (3%) 12 (13%) 0 (0%)
    6 1 (2%) 3 (14%) 1 (1%) 4 (7%) 2 (3%) 11 (12%) 0 (0%)
    7 0 (0%) 2 (10%) 11 (16%) 18 (31%) 23 (31%) 38 (40%) 3 (16%)
Seq_region






<0.001
    V3-V4 23 (50%) 2 (10%) 32 (47%) 14 (24%) 25 (34%) 8 (8%) 2 (11%)
    V4 16 (35%) 16 (76%) 18 (26%) 33 (57%) 42 (57%) 87 (92%) 17 (89%)
    V5-V6 7 (15%) 3 (14%) 18 (26%) 11 (19%) 7 (9%) 0 (0%) 0 (0%)
Seq_date






<0.001
    2015 0 (0%) 2 (10%) 11 (16%) 18 (31%) 23 (31%) 38 (40%) 3 (16%)
    2017 0 (0%) 9 (43%) 0 (0%) 2 (3%) 2 (3%) 12 (13%) 0 (0%)
    2018 17 (37%) 3 (14%) 26 (38%) 16 (28%) 22 (30%) 7 (7%) 5 (26%)
    2020 11 (24%) 0 (0%) 4 (6%) 5 (9%) 7 (9%) 9 (9%) 3 (16%)
    2021 4 (9%) 2 (10%) 2 (3%) 2 (3%) 5 (7%) 10 (11%) 7 (37%)
    2022 1 (2%) 3 (14%) 1 (1%) 4 (7%) 2 (3%) 12 (13%) 0 (0%)
    2023 13 (28%) 2 (10%) 24 (35%) 11 (19%) 13 (18%) 7 (7%) 1 (5%)
Health_state






<0.001
    Disease 1 (2%) 15 (71%) 13 (19%) 28 (48%) 31 (42%) 74 (78%) 8 (42%)
    Health 45 (98%) 6 (29%) 55 (81%) 30 (52%) 43 (58%) 21 (22%) 11 (58%)
Age 27 (25, 37) 34 (26, 46) 32 (28, 33) 32 (28, 41) 28 (26, 46) 41 (29, 49) 47 (33, 52) 0.023
    Unknown 30 7 61 43 55 46 5
Age_range






<0.001
    > 43 2 (4%) 5 (26%) 0 (0%) 4 (10%) 6 (12%) 19 (33%) 8 (50%)
    16-43 44 (96%) 14 (74%) 57 (100%) 36 (90%) 45 (88%) 38 (67%) 8 (50%)
    Unknown 0 2 11 18 23 38 3
Sex






0.007
    female 12 (46%) 7 (47%) 14 (41%) 8 (35%) 8 (24%) 28 (62%) 2 (14%)
    male 14 (54%) 8 (53%) 20 (59%) 15 (65%) 26 (76%) 17 (38%) 12 (86%)
    Unknown 20 6 34 35 40 50 5
Country






<0.001
    Australia 2 (4%) 0 (0%) 0 (0%) 0 (0%) 1 (1%) 0 (0%) 3 (16%)
    Austria 0 (0%) 9 (43%) 0 (0%) 2 (3%) 2 (3%) 12 (13%) 3 (16%)
    Belgium 7 (15%) 3 (14%) 18 (26%) 11 (19%) 7 (9%) 0 (0%) 0 (0%)
    Germany 13 (28%) 1 (5%) 5 (7%) 5 (9%) 10 (14%) 11 (12%) 0 (0%)
    Greece 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (1%) 0 (0%)
    Hungary 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2 (11%)
    Ireland 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (1%) 0 (0%)
    Israel 1 (2%) 3 (14%) 1 (1%) 4 (7%) 2 (3%) 12 (13%) 0 (0%)
    Italy 10 (22%) 0 (0%) 8 (12%) 3 (5%) 12 (16%) 2 (2%) 2 (11%)
    Norway 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (1%) 0 (0%) 0 (0%)
    Poland 13 (28%) 2 (10%) 24 (35%) 11 (19%) 13 (18%) 6 (6%) 1 (5%)
    Spain 0 (0%) 2 (10%) 11 (16%) 18 (31%) 23 (31%) 38 (40%) 3 (16%)
    UK 0 (0%) 0 (0%) 0 (0%) 4 (7%) 2 (3%) 2 (2%) 4 (21%)
    USA 0 (0%) 1 (5%) 1 (1%) 0 (0%) 1 (1%) 10 (11%) 1 (5%)
Race






0.10
    African American 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (4%) 0 (0%)
    Asian or Pacific Islander 0 (0%) 0 (0%) 0 (0%) 1 (5%) 0 (0%) 0 (0%) 0 (0%)
    Caucasian 14 (70%) 5 (100%) 24 (100%) 18 (90%) 19 (86%) 22 (88%) 14 (100%)
    Hispanic 1 (5%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Other 5 (25%) 0 (0%) 0 (0%) 1 (5%) 3 (14%) 2 (8%) 0 (0%)
    Unknown 26 16 44 38 52 70 5
Smoking 2 (6%) 1 (14%) 3 (6%) 1 (3%) 5 (14%) 3 (9%) 1 (7%) 0.6
    Unknown 11 14 20 27 39 63 4
Alcohol






0.7
    Never 2 (13%) 0 (0%) 1 (17%) 1 (11%) 1 (7%) 5 (19%) 2 (14%)
    Occasionally (1-2 times/week) 5 (33%) 0 (0%) 3 (50%) 1 (11%) 5 (33%) 7 (27%) 3 (21%)
    Rarely (a few times/month) 8 (53%) 1 (50%) 1 (17%) 4 (44%) 7 (47%) 8 (31%) 7 (50%)
    Regularly (3-7 times/week) 0 (0%) 1 (50%) 1 (17%) 3 (33%) 2 (13%) 6 (23%) 2 (14%)
    Unknown 31 19 62 49 59 69 5
Antibiotics_usage






0.004
    1-6 months 7 (23%) 4 (67%) 18 (35%) 11 (38%) 7 (19%) 0 (0%) 1 (17%)
    12 months/Not use 23 (77%) 2 (33%) 33 (65%) 18 (62%) 29 (81%) 19 (100%) 5 (83%)
    Unknown 16 15 17 29 38 76 13
Physical_activity 2 (6%) 1 (14%) 3 (6%) 1 (3%) 5 (14%) 3 (9%) 1 (7%) 0.6
    Unknown 11 14 20 27 39 63 4
Travel_period






0.15
    3 months 4 (27%) 0 (0%) 2 (33%) 1 (11%) 2 (13%) 5 (19%) 4 (31%)
    6 months 2 (13%) 0 (0%) 1 (17%) 1 (11%) 4 (27%) 2 (8%) 0 (0%)
    Month 3 (20%) 0 (0%) 2 (33%) 1 (11%) 0 (0%) 2 (8%) 5 (38%)
    Year 6 (40%) 2 (100%) 1 (17%) 6 (67%) 9 (60%) 17 (65%) 4 (31%)
    Unknown 31 19 62 49 59 69 6
Education_level






0.4
    Bachelor's level 9 (60%) 1 (50%) 3 (50%) 3 (33%) 5 (38%) 6 (23%) 4 (31%)
    Master's level 2 (13%) 0 (0%) 1 (17%) 4 (44%) 3 (23%) 11 (42%) 7 (54%)
    School Level 4 (27%) 1 (50%) 2 (33%) 2 (22%) 5 (38%) 9 (35%) 2 (15%)
    Unknown 31 19 62 49 61 69 6
Cosmetics






0.2
    Never 5 (33%) 2 (100%) 1 (17%) 4 (50%) 9 (60%) 13 (50%) 10 (71%)
    Occasionally (a few-8 times/month) 3 (20%) 0 (0%) 2 (33%) 3 (38%) 3 (20%) 6 (23%) 0 (0%)
    Regularly (3-7 times/week) 7 (47%) 0 (0%) 3 (50%) 1 (13%) 3 (20%) 7 (27%) 4 (29%)
    Unknown 31 19 62 50 59 69 5
Sleep_duration






0.3
    > 8 hours 3 (20%) 0 (0%) 0 (0%) 1 (11%) 2 (13%) 1 (4%) 2 (14%)
    4-6 hours 1 (7%) 1 (50%) 0 (0%) 1 (11%) 1 (7%) 1 (4%) 4 (29%)
    6-7 hours 5 (33%) 1 (50%) 2 (33%) 1 (11%) 5 (33%) 12 (46%) 5 (36%)
    7-8 hours 6 (40%) 0 (0%) 4 (67%) 6 (67%) 7 (47%) 12 (46%) 3 (21%)
    Unknown 31 19 62 49 59 69 5
BMI_category






0.8
    normal/overweight 30 (97%) 11 (100%) 62 (98%) 49 (98%) 60 (98%) 66 (97%) 9 (100%)
    obese 1 (3%) 0 (0%) 1 (2%) 0 (0%) 1 (2%) 2 (3%) 0 (0%)
    underweight 0 (0%) 0 (0%) 0 (0%) 1 (2%) 0 (0%) 0 (0%) 0 (0%)
    Unknown 15 10 5 8 13 27 10
1 n (%); Median (IQR)
2 Fisher’s Exact Test for Count Data with simulated p-value (based on 2000 replicates); Kruskal-Wallis rank sum test

bac_functions dataset

path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")

neuromediators <- read_xlsx (path, 2) %>% 
  mutate(Destroy = ifelse(is.na (Destroy), "produce", "destroy")) %>% 
  unique() %>%
  pivot_wider(names_from = Neuromediator, values_from = Destroy)

probiotics <- read_xlsx (path, 3) %>% 
  add_column(probiotics = 1)

special_properties <- read_xlsx (path, 4) %>% 
  add_column(special_properties = 1)

vitamins <- read_xlsx (path, 5) %>% 
  pivot_wider (names_from = Vitamin, values_from = Vitamin,
               values_fn = function(x) ifelse(is.na (x), NA, 1))

habbits <- read_xlsx (path, 7) %>%
  unique() %>% #deleting duplicate rows
  pivot_wider(names_from = Habbit, values_from = Habit_state)

bac_functions <- read_xlsx (path, 1) %>% #Pathogens and undesirable
  full_join(neuromediators, by = taxon) %>% #Neurotransmitters
  full_join(probiotics, by = taxon) %>% #Probiotics
  full_join(special_properties, by = taxon) %>% #With special properties
  full_join(vitamins, by = taxon) %>%  #Vitamins
  full_join(read_xlsx (path, 6), by = taxon) %>% #Producers of short-chain fatty acids
  full_join(habbits, by = taxon) %>% #Bad habits
  
  unite("Taxon", TaxonName, Rank, sep = "_") %>% 
  filter (Taxon != "Blautia obeum_S") %>% #for this taxon, there is conflicting information in the Producers of SCFA
  mutate_all(as.factor)

rm (path, taxon, neuromediators, probiotics, special_properties, vitamins, habbits)

data_long

# Сonverting the data_wide to a long format
data_long <- data_wide %>% 
  pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
               names_to = "Taxon", values_to = "Percentage") %>% 
  left_join (bac_functions, by = "Taxon") #combining with `bac_functions` datasets

write_rds(data_long, 
          file = "data/originals/data_long.rds",
          compress = "gz") 

#Creation a long dataset for each taxon
G_long <- data_long %>% subset(grepl("_G", Taxon)) 
F_long <- data_long %>% subset(grepl("_F", Taxon))
C_long <- data_long %>% subset(grepl("_C", Taxon))
O_long <- data_long %>% subset(grepl("_O", Taxon))
P_long <- data_long %>% subset(grepl("_P", Taxon))

write_rds(G_long, 
          file = "data/originals/G_long.rds", "gz") 
write_rds(F_long, 
          file = "data/originals/F_long.rds", "gz") 
write_rds(C_long, 
          file = "data/originals/C_long.rds", "gz") 
write_rds(O_long, 
          file = "data/originals/O_long.rds", "gz") 
write_rds(P_long, 
          file = "data/originals/P_long.rds", "gz") 

Analysis

Total percentage means IBS/healthy comparison for G-taxa with special properties

comparison <- function (special_data, special_property) {
#vector of the G_taxa names with special properties:  
    bac_names <-  special_data %>% 
      subset(grepl("_G", Taxon)) %>% .$Taxon
#number of the G_taxa in data_wide
    number_of_taxons <- data_wide %>% 
      select(patient_ID, Health_state, any_of(bac_names)) %>% 
      ncol() - 2
# t-test
t <- data_wide %>% 
  select(patient_ID, Health_state, any_of(bac_names)) %>% 
  pivot_longer(ends_with("_G"),
               names_to = "Taxon", values_to = "Percentage") %>% 
  group_by(patient_ID, Health_state) %>% 
  summarise(across(Percentage, sum)) %>% 
  ungroup() %>% 
  t.test (Percentage ~ Health_state, .)
#a tibble with results
tibble("G_taxa" = special_property, 
       "Number of taxa" = number_of_taxons,
       "Mean of total percentage in IBS" = t$estimate["mean in group Disease"],
       "Mean of total percentage in healthy" = t$estimate["mean in group Health"],
       "95% CI_1" = t$conf.int[1],
       "95% CI_2" = t$conf.int[2],
       "p.unadjusted" = t$p.value)
}


comparison_in_total <- rbind(
    comparison (bac_functions %>% filter (Серотонин == "produce"), "Serotonin producing"),
    comparison (bac_functions %>% filter (Серотонин == "destroy"), "Serotonin destroying"),
    comparison (bac_functions %>% filter (Ацетилхолин == "produce"), "Acetylcholine producing"),
    comparison (bac_functions %>% filter (ГАМК == "produce"), "GABA producing"),
    comparison (bac_functions %>% filter (ГАМК == "destroy"), "GABA destroying"),
    comparison (bac_functions %>% filter (Дофамин == "produce"), "Dopamine producing"),
    comparison (bac_functions %>% filter (Дофамин == "destroy"), "Dopamine destroying"),
    comparison (bac_functions %>% filter (probiotics == 1), "Probiotics"),
    comparison (bac_functions %>% filter (special_properties == 1), "Special_properties"),
    comparison (bac_functions %>% filter (Gases == 1), "Gases producing"),
    comparison (bac_functions %>% filter (Oral == 1), "Oral"),
    comparison (bac_functions %>% filter (Inflammatory == 1), "Inflammatory"),
    comparison (bac_functions %>% filter (Bacteria_category == "Патоген"), "Pathogens"),
    comparison (bac_functions %>% filter (Bacteria_category == "Условно-нормальный"), "Conditionally normal"),
    comparison (bac_functions %>% filter (A == 1), "Vitamin A producing"),
    comparison (bac_functions %>% filter (B1 == 1), "Vitamin B1 producing"),
    comparison (bac_functions %>% filter (B12 == 1), "Vitamin B12 producing"),
    comparison (bac_functions %>% filter (B2 == 1), "Vitamin B2 producing"),
    comparison (bac_functions %>% filter (B3 == 1), "Vitamin B3 producing"),
    comparison (bac_functions %>% filter (B5 == 1), "Vitamin B5 producing"),
    comparison (bac_functions %>% filter (B6 == 1), "Vitamin B6 producing"),
    comparison (bac_functions %>% filter (B7 == 1), "Vitamin B7 producing"),
    comparison (bac_functions %>% filter (B9 == 1), "Vitamin B9 producing"),
    comparison (bac_functions %>% filter (D3 == 1), "Vitamin D3 producing"),
    comparison (bac_functions %>% filter (K2 == 1), "Vitamin K2 producing"),
    comparison (bac_functions %>% filter (Ацетат == 1), "Acetate producing"),
    comparison (bac_functions %>% filter (Пропионат == 1), "Propionate producing"),
    comparison (bac_functions %>% filter (`Масляная кислота` == 1), "Butyrate producing"),
    comparison (bac_functions %>% filter (Кофе == "Увеличена"), "Increase with coffee"),
    comparison (bac_functions %>% filter (Курение == "Увеличена"), "Increase with smoking"),
    comparison (bac_functions %>% filter (Курение == "Уменьшена"), "Derease with smoking"),
    comparison (bac_functions %>% filter (Алкоголь == "Увеличена"), "Increase with alcohol"),
    comparison (bac_functions %>% filter (Алкоголь == "Уменьшена"), "Derease with alcohol")
    ) %>% 
  filter (`Number of taxa` >= 3) %>% 
  add_column(.before = "95% CI_1", "Difference in means" = .$"Mean of total percentage in IBS" - .$"Mean of total percentage in healthy") %>% 
  mutate (across (c ("Mean of total percentage in IBS", "Mean of total percentage in healthy", "Difference in means", "95% CI_1", "95% CI_2"), function (x) round (x,2)),
          p.unadjusted = round (p.unadjusted, 3),
          p.value.holm = p.adjust(p.unadjusted, "holm")) %>% 
  unite("95% CI_1", "95% CI_2", col = "95% CI", sep = ":") %>% 
  arrange(desc (abs (.$"Difference in means")))

write_rds(comparison_in_total, 
          file = "data/originals/comparison_in_total.rds") 

comparison_in_total %>% 
  mutate (p.unadjusted = ifelse(p.unadjusted < 0.001, "< 0.001", p.unadjusted),
          p.value.holm = ifelse(p.value.holm < 0.001, "< 0.001", p.value.holm)) %>%
  flextable() %>% theme_box() %>% 
  align(align = "center", part = "all") %>% 
  flextable::footnote (i = 1, j = 7, value = as_paragraph (c ("Welch Two Sample t-test")),
            ref_symbols = "1", part = "header") %>% 
  set_caption("Results of IBS/healthy people comparison of means of total percentage of G-taxa with special properties")
Results of IBS/healthy people comparison of means of total percentage of G-taxa with special properties

G_taxa

Number of taxa

Mean of total percentage in IBS

Mean of total percentage in healthy

Difference in means

95% CI

p.unadjusted1

p.value.holm

Inflammatory

48

5.34

3.00

2.34

1.65:3.03

< 0.001

< 0.001

Propionate producing

18

22.95

20.66

2.29

0.11:4.48

0.04

0.8

Acetate producing

31

26.19

24.30

1.89

-0.31:4.08

0.092

1

Increase with alcohol

3

15.25

13.38

1.87

-0.14:3.87

0.068

1

Increase with coffee

5

7.71

9.49

-1.78

-2.66:-0.89

< 0.001

< 0.001

Vitamin B12 producing

4

4.69

6.23

-1.54

-2.35:-0.74

< 0.001

< 0.001

Pathogens

7

1.31

0.07

1.24

0.91:1.57

< 0.001

< 0.001

Special_properties

34

11.12

12.28

-1.16

-2.15:-0.17

0.022

0.484

Conditionally normal

50

4.37

3.27

1.10

0.43:1.77

0.001

0.023

Gases producing

9

2.77

2.01

0.76

0.38:1.14

< 0.001

< 0.001

Serotonin destroying

7

2.97

2.25

0.72

0.09:1.36

0.026

0.546

Probiotics

5

1.97

2.48

-0.51

-1.15:0.12

0.115

1

Vitamin B9 producing

9

3.34

3.83

-0.50

-1.14:0.15

0.131

1

Serotonin producing

20

5.75

6.20

-0.45

-1.24:0.34

0.266

1

Increase with smoking

10

2.47

2.02

0.45

-0.1:1

0.11

1

GABA producing

6

2.82

3.27

-0.44

-1.09:0.2

0.175

1

Vitamin D3 producing

4

2.90

3.28

-0.38

-1.04:0.28

0.257

1

Vitamin A producing

4

2.77

3.13

-0.36

-1.02:0.29

0.279

1

Vitamin B1 producing

7

3.00

3.36

-0.36

-1.04:0.31

0.292

1

Vitamin B5 producing

13

3.90

3.57

0.33

-0.13:0.79

0.159

1

Vitamin B2 producing

29

7.46

7.73

-0.27

-1.11:0.57

0.534

1

Derease with smoking

4

3.52

3.34

0.18

-0.52:0.88

0.621

1

Butyrate producing

34

25.06

25.16

-0.11

-2.17:1.95

0.919

1

Vitamin B3 producing

16

6.90

7.00

-0.10

-0.92:0.72

0.811

1

Vitamin B6 producing

25

5.90

5.81

0.09

-0.71:0.89

0.829

1

Vitamin B7 producing

14

3.01

3.09

-0.08

-0.58:0.42

0.75

1

Oral

18

0.80

0.86

-0.07

-0.33:0.19

0.612

1

Vitamin K2 producing

3

0.03

0.03

0.00

-0.02:0.01

0.631

1

1Welch Two Sample t-test

  • Посмотрим на данные здоровых людей и людей с СРК как на многомерный вектор по всем таксонам и группам
combined_bacteria_G <- data_wide %>%
  select(Health_state, ends_with("_G"))

d <- dist(combined_bacteria_G) # euclidean distances between the rows
fit <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim

df_mds <- data.frame(
  x = fit$points[,1],
  y = fit$points[,2]
  )  

df_full <- cbind(df_mds, combined_info) %>% 
  mutate(Health_state_n = case_when(Health_state == "Health"  ~ 0,
  Health_state == "Disease" ~ 1))

ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
  geom_point() +
  theme_bw() +
  ggtitle("Распределение вектора G-таксонов в зависимости от группы пациентов")

  • Используя метод пермутаций проверим отличаются ли группы в зависимомти от Health_state
adonis2(d ~ Health_state_n, data = df_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs     R2      F Pr(>F)   
## Health_state_n   1     1046 0.0122 4.6817  0.007 **
## Residual       379    84659 0.9878                 
## Total          380    85705 1.0000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Тест Манна-Уитни для сравнения каждого таксона между больными и здоровыми

  • после округления всех значений до целого
Wilcox_comparison_round_0 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,0))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_0),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       100
## Количество значимо различающихся таксонов по p_value_holm   54
## Количество значимо различающихся таксонов по p_value_BH    100
  • после округления всех значений до десятых
Wilcox_comparison_round_1 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,1))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_1),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       273
## Количество значимо различающихся таксонов по p_value_holm  155
## Количество значимо различающихся таксонов по p_value_BH    273
  • после округления всех значений до сотых
Wilcox_comparison_round_2 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,2))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_2),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       599
## Количество значимо различающихся таксонов по p_value_holm  374
## Количество значимо различающихся таксонов по p_value_BH    599

GLM метод для нулей

library("haven")
library("ResourceSelection")  ## Package to perform the Hosmer-Lemeshow GOF test
library("survey")
library("prediction")
library("margins")
library("ggeffects")
library("sjPlot")
library("statmod")
#devtools::install_github("strengejacke/strengejacke")
#install.packages(c("haven", "ResourceSelection", "survey", "prediction", "margins", "ggeffects", "sjPlot", "statmod"))
options(survey.lonely.psu = 'adjust')
start_values <- c(0, 0, 1)

G - Genus (Род)

analyze_taxon <- function(taxon_name, data) {
  tryCatch(
    {
      data_filtered <- data %>%
        filter(Taxon == taxon_name) %>%
        select(patient_ID, Health_state, Percentage, Age_range, research_ID) %>%
        mutate(Health_state_num = ifelse(Health_state == "Health", 0, 1)) # для упрацения интерпритации процентов
      
      mepsdsgn = svydesign(
        id = ~patient_ID,
        strata = ~research_ID,
        weights = NULL,
        data = data_filtered,
        nest = TRUE)
      
      start_values <- c(0, 0, 1)
      
      model <- svyglm(Percentage ~ Health_state_num + research_ID,
                      mepsdsgn,
                      family = tweedie(var.power = 2, link.power = 1),
                      start = start_values)
      
      summary_table <- summary(model)
      
      tidy_output <- tidy(model)
      
      result_table1 <- tidy_output %>%
        filter(term == "Health_state_num") %>%
        select(estimate, p.value)
      
      result_table2 <- as.data.frame(confint(model)["Health_state_num", ])
      
      result_table2_transposed <- t(result_table2)
      
      final_result_table_transposed <- bind_cols(result_table1, result_table2_transposed) %>%
        select(estimate, `2.5 %`, `97.5 %`, p.value)
      
      final_result_table_transposed$Taxon <- taxon_name
      
      return(final_result_table_transposed)
    },
    error = function(e) {
      # Обработка ошибки (можно добавить сообщение или просто вернуть NULL)
      # cat("Error in analyze_taxon for Taxon:", taxon_name, "\n")
      return(NULL)
    }
  )
}

# Применение функции ко всем таксонам
unique_taxa <- unique(G_long$Taxon)

result_list <- lapply(unique_taxa, function(taxon) {
  analyze_result <- analyze_taxon(taxon, G_long)
  if (!is.null(analyze_result)) {
    return(analyze_result)
  } else {
    return(data.frame())  # Вернуть пустой data.frame, чтобы не влиять на bind_rows
  }
})

# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
  mutate(estimate = round(estimate, 3),
         `2.5 %` = round(`2.5 %`, 4),
         `97.5 %` = round(`97.5 %`, 4)
         )

# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)

# Фильтрация только статистически значимых результатов
significant_results_G <- final_result_df %>%
  filter(p.adjusted < 0.05) %>%  arrange(desc(abs(estimate)))

 #install.packages("knitr")
 #install.packages("kableExtra")

# Загрузка библиотек
library(knitr)
library(kableExtra)

# Ваш код

# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_G, "html") %>%
  kable_styling(full_width = F)
estimate 2.5 % 97.5 % p.value Taxon p.adjusted
0.114 0.0529 0.1743 0.0002665 Methanobrevibacter_G 0.031
-0.107 -0.1274 -0.0869 0.0000000 Flavobacterium_G 0.000
0.086 0.0564 0.1152 0.0000000 Senegalimassilia_G 0.000
0.065 0.0375 0.0929 0.0000052 Solobacterium_G 0.001
0.058 0.0444 0.0707 0.0000000 Acidaminobacter_G 0.000
-0.050 -0.0664 -0.0329 0.0000000 Alkaliflexus_G 0.000
0.050 0.0277 0.0724 0.0000141 Faecalitalea_G 0.002
0.018 0.0117 0.0242 0.0000000 Lactiplantibacillus_G 0.000
-0.016 -0.0198 -0.0116 0.0000000 Thermacetogenium_G 0.000
0.013 0.0082 0.0178 0.0000002 Oxalobacter_G 0.000
0.013 0.0095 0.0173 0.0000000 Porphyrobacter_G 0.000
0.013 0.0071 0.0197 0.0000338 Flavonifractor_G 0.004
0.010 0.0062 0.0129 0.0000000 Anoxybacillus_G 0.000
0.009 0.0063 0.0127 0.0000000 Lachnospiraceae NK4B4 group_G 0.000
0.009 0.0071 0.0104 0.0000000 Halochromatium_G 0.000
0.009 0.0068 0.0113 0.0000000 Anaerostignum_G 0.000
-0.009 -0.0113 -0.0075 0.0000000 Erysipelotrichaceae UCG-002_G 0.000
0.007 0.0057 0.0084 0.0000000 Colwellia_G 0.000
0.007 0.0036 0.0112 0.0001343 Marinobacter_G 0.016
-0.007 -0.0101 -0.0038 0.0000201 Candidatus Armantifilum_G 0.003
0.006 0.0047 0.0079 0.0000000 Hespellia_G 0.000
-0.006 -0.0077 -0.0052 0.0000000 Sedimentibacter_G 0.000
-0.006 -0.0086 -0.0030 0.0000666 Rikenella_G 0.008
0.004 0.0032 0.0048 0.0000000 Syntrophomonas_G 0.000
0.004 0.0022 0.0052 0.0000021 Lentilactobacillus_G 0.000
0.004 0.0019 0.0054 0.0000631 Rubrobacter_G 0.008
0.004 0.0030 0.0047 0.0000000 Pseudarthrobacter_G 0.000
0.004 0.0024 0.0058 0.0000022 Acholeplasma_G 0.000
-0.004 -0.0046 -0.0029 0.0000000 Crocinitomix_G 0.000
-0.004 -0.0062 -0.0024 0.0000098 Macellibacteroides_G 0.001
0.003 0.0022 0.0046 0.0000000 Hassallia_G 0.000
0.003 0.0015 0.0040 0.0000102 Halobacillus_G 0.001
0.003 0.0012 0.0040 0.0001789 RB41_G 0.021
0.003 0.0013 0.0041 0.0001600 Antarcticibacterium_G 0.019
-0.003 -0.0036 -0.0016 0.0000005 Leptococcus JA-3-3Ab_G 0.000
-0.003 -0.0037 -0.0016 0.0000009 Desulfurispora_G 0.000
-0.003 -0.0036 -0.0016 0.0000004 Achromobacter_G 0.000
-0.003 -0.0047 -0.0019 0.0000036 Coriobacteriaceae UCG-002_G 0.001
-0.003 -0.0031 -0.0021 0.0000000 Formosa_G 0.000
-0.003 -0.0039 -0.0022 0.0000000 Paramaledivibacter_G 0.000
-0.003 -0.0043 -0.0014 0.0001027 Hydrogenispora_G 0.013
-0.003 -0.0033 -0.0021 0.0000000 Serpentinicella_G 0.000
-0.003 -0.0037 -0.0021 0.0000000 Oxobacter_G 0.000
-0.003 -0.0035 -0.0023 0.0000000 Lachnotalea_G 0.000
-0.003 -0.0032 -0.0023 0.0000000 Subsaximicrobium_G 0.000
-0.003 -0.0046 -0.0017 0.0000214 Salinibacter_G 0.003
-0.003 -0.0040 -0.0013 0.0000805 XBB1006_G 0.010
-0.003 -0.0042 -0.0016 0.0000109 Pygmaiobacter_G 0.001
-0.003 -0.0045 -0.0016 0.0000334 Marinifilum_G 0.004
0.002 0.0011 0.0023 0.0000001 Anaerospora_G 0.000
0.002 0.0012 0.0023 0.0000000 Truepera_G 0.000
0.002 0.0010 0.0028 0.0000743 Nocardia_G 0.009
-0.002 -0.0031 -0.0015 0.0000001 Acetoanaerobium_G 0.000
-0.002 -0.0026 -0.0010 0.0000107 Chthoniobacter_G 0.001
0.002 0.0012 0.0038 0.0001601 Spirochaeta 2_G 0.019
-0.001 -0.0013 -0.0006 0.0000014 Arcicella_G 0.000
-0.001 -0.0014 -0.0006 0.0000029 Pseudoclostridium_G 0.000
-0.001 -0.0014 -0.0006 0.0000020 Clostridiisalibacter_G 0.000
0.001 0.0007 0.0017 0.0000031 Sporanaerobacter_G 0.000
0.001 0.0008 0.0017 0.0000000 Leucobacter_G 0.000
-0.001 -0.0018 -0.0007 0.0000117 Virgibacillus_G 0.002
-0.001 -0.0015 -0.0007 0.0000001 [Eubacterium] yurii group_G 0.000
-0.001 -0.0020 -0.0008 0.0000040 Aeriscardovia_G 0.001
-0.001 -0.0020 -0.0009 0.0000008 Blvii28 wastewater-sludge group_G 0.000
-0.001 -0.0021 -0.0008 0.0000194 Izimaplasma_G 0.003
-0.001 -0.0021 -0.0007 0.0000428 Lawsonella_G 0.005

F - Family (семейство)

# Применение функции ко всем таксонам
unique_taxa <- unique(F_long$Taxon)

result_list <- lapply(unique_taxa, function(taxon) {
  analyze_result <- analyze_taxon(taxon, F_long)
  if (!is.null(analyze_result)) {
    return(analyze_result)
  } else {
    return(data.frame())  # Вернуть пустой data.frame, чтобы не влиять на bind_rows
  }
})

# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
  mutate(estimate = round(estimate, 3),
         `2.5 %` = round(`2.5 %`, 4),
         `97.5 %` = round(`97.5 %`, 4)
         )

# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)

# Фильтрация только статистически значимых результатов
significant_results_F <- final_result_df %>%
  filter(p.adjusted < 0.05) %>%  arrange(desc(abs(estimate)))

# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_F, "html") %>%
  kable_styling(full_width = F)
estimate 2.5 % 97.5 % p.value Taxon p.adjusted
0.891 0.7287 1.0532 0.0000000 Pedosphaeraceae_F 0.000
0.301 0.2544 0.3471 0.0000000 A4b_F 0.000
-0.080 -0.1152 -0.0445 0.0000117 Methanobacteriaceae_F 0.001
0.063 0.0480 0.0783 0.0000000 Chromatiaceae_F 0.000
0.058 0.0444 0.0707 0.0000000 Acidaminobacteraceae_F 0.000
-0.055 -0.0739 -0.0360 0.0000000 Clade II_F 0.000
-0.021 -0.0329 -0.0097 0.0003409 Puniceicoccaceae_F 0.017
-0.016 -0.0198 -0.0117 0.0000000 Thermacetogeniaceae_F 0.000
0.010 0.0054 0.0150 0.0000326 Sphingomonadaceae_F 0.002
0.007 0.0036 0.0111 0.0001574 Micromonosporaceae_F 0.008
0.007 0.0036 0.0112 0.0001343 Marinobacteraceae_F 0.007
0.006 0.0034 0.0078 0.0000009 WD2101 soil group_F 0.000
0.006 0.0051 0.0073 0.0000000 Iamiaceae_F 0.000
0.006 0.0038 0.0079 0.0000000 Colwelliaceae_F 0.000
0.006 0.0037 0.0082 0.0000004 Hydrogenedensaceae_F 0.000
0.006 0.0043 0.0076 0.0000000 NS11-12 marine group_F 0.000
-0.006 -0.0077 -0.0052 0.0000000 Sedimentibacteraceae_F 0.000
-0.004 -0.0057 -0.0016 0.0004772 Didymellaceae_F 0.024
0.004 0.0022 0.0059 0.0000247 Thermomonosporaceae_F 0.001
0.004 0.0031 0.0055 0.0000000 Syntrophomonadaceae_F 0.000
-0.004 -0.0042 -0.0032 0.0000000 Aureobasidiaceae_F 0.000
0.004 0.0019 0.0054 0.0000631 Rubrobacteriaceae_F 0.004
0.003 0.0013 0.0040 0.0001420 Pyrinomonadaceae_F 0.008
0.003 0.0019 0.0033 0.0000000 Obscuribacteraceae_F 0.000
0.003 0.0020 0.0034 0.0000000 Limnochordaceae_F 0.000
-0.003 -0.0036 -0.0016 0.0000005 Leptococcaceae_F 0.000
-0.003 -0.0037 -0.0016 0.0000009 Desulfurisporaceae_F 0.000
-0.003 -0.0044 -0.0013 0.0003096 Salinivirgaceae_F 0.016
0.003 0.0020 0.0038 0.0000000 Xanthobacteraceae_F 0.000
-0.003 -0.0037 -0.0021 0.0000000 Oxobacteraceae_F 0.000
-0.003 -0.0043 -0.0017 0.0000112 Chthoniobacteraceae_F 0.001
-0.003 -0.0040 -0.0014 0.0000790 Pirellulaceae_F 0.004
-0.002 -0.0033 -0.0015 0.0000001 Trichocomaceae_F 0.000
0.002 0.0012 0.0023 0.0000000 Trueperaceae_F 0.000
0.002 0.0016 0.0030 0.0000000 type III_F 0.000
-0.002 -0.0026 -0.0012 0.0000004 Izemoplasmataceae_F 0.000
0.002 0.0007 0.0025 0.0006969 Rs-E47 termite group_F 0.033
0.002 0.0019 0.0030 0.0000000 09D2Z48_F 0.000
-0.002 -0.0038 -0.0010 0.0006464 TRA3-20_F 0.031
0.002 0.0020 0.0028 0.0000000 Simkaniaceae_F 0.000
-0.002 -0.0033 -0.0015 0.0000001 PeH15_F 0.000
-0.001 -0.0012 -0.0003 0.0005701 Bacteriovoracaceae_F 0.028
0.001 0.0006 0.0021 0.0006965 Halothiobacillaceae_F 0.033
-0.001 -0.0019 -0.0009 0.0000003 Cladosporiaceae_F 0.000
-0.001 -0.0020 -0.0008 0.0000131 01D2Z36_F 0.001
0.001 0.0009 0.0017 0.0000000 Streptosporangiaceae_F 0.000

C - Class (класс)

# Применение функции ко всем таксонам
unique_taxa <- unique(C_long$Taxon)

result_list <- lapply(unique_taxa, function(taxon) {
  analyze_result <- analyze_taxon(taxon, C_long)
  if (!is.null(analyze_result)) {
    return(analyze_result)
  } else {
    return(data.frame())  # Вернуть пустой data.frame, чтобы не влиять на bind_rows
  }
})

# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
  mutate(estimate = round(estimate, 3),
         `2.5 %` = round(`2.5 %`, 4),
         `97.5 %` = round(`97.5 %`, 4)
         )

# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)

# Фильтрация только статистически значимых результатов
significant_results_C <- final_result_df %>%
  filter(p.adjusted < 0.05) %>%  arrange(desc(abs(estimate)))

# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_C, "html") %>%
  kable_styling(full_width = F)
estimate 2.5 % 97.5 % p.value Taxon p.adjusted
-0.080 -0.1152 -0.0445 1.17e-05 Methanobacteria_C 0.000
0.036 0.0275 0.0435 0.00e+00 Thermodesulfovibrionia_C 0.000
0.022 0.0172 0.0265 0.00e+00 Armatimonadia_C 0.000
-0.017 -0.0222 -0.0124 0.00e+00 Dothideomycetes_C 0.000
-0.016 -0.0198 -0.0117 0.00e+00 Thermacetogenia_C 0.000
0.014 0.0105 0.0166 0.00e+00 Acidimicrobiia_C 0.000
0.012 0.0090 0.0144 0.00e+00 Chlamydiae_C 0.000
0.010 0.0072 0.0126 0.00e+00 Dehalococcoidia_C 0.000
0.009 0.0064 0.0108 0.00e+00 Desulfotomaculia_C 0.000
0.006 0.0037 0.0082 4.00e-07 Hydrogenedentia_C 0.000
0.006 0.0034 0.0078 1.00e-06 Vicinamibacteria_C 0.000
-0.006 -0.0074 -0.0036 0.00e+00 D8A-2_C 0.000
0.006 0.0042 0.0076 0.00e+00 Thermoleophilia_C 0.000
0.005 0.0030 0.0071 2.80e-06 Ignavibacteria_C 0.000
-0.004 -0.0052 -0.0026 0.00e+00 Eurotiomycetes_C 0.000
0.004 0.0031 0.0055 0.00e+00 Syntrophomonadia_C 0.000
0.004 0.0019 0.0054 6.31e-05 Rubrobacteria_C 0.001
0.004 0.0026 0.0055 0.00e+00 vadinHA49_C 0.000
0.003 0.0017 0.0042 4.70e-06 MB-A2-108_C 0.000

O - Order (порядок)

# Применение функции ко всем таксонам
unique_taxa <- unique(O_long$Taxon)

result_list <- lapply(unique_taxa, function(taxon) {
  analyze_result <- analyze_taxon(taxon, O_long)
  if (!is.null(analyze_result)) {
    return(analyze_result)
  } else {
    return(data.frame())  # Вернуть пустой data.frame, чтобы не влиять на bind_rows
  }
})

# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
  mutate(estimate = round(estimate, 3),
         `2.5 %` = round(`2.5 %`, 4),
         `97.5 %` = round(`97.5 %`, 4)
         )

# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)

# Фильтрация только статистически значимых результатов
significant_results_O <- final_result_df %>%
  filter(p.adjusted < 0.05) %>%  arrange(desc(abs(estimate)))

# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_O, "html") %>%
  kable_styling(full_width = F)
estimate 2.5 % 97.5 % p.value Taxon p.adjusted
0.891 0.7287 1.0532 0.0000000 Pedosphaerales_O 0.000
-0.080 -0.1152 -0.0445 0.0000117 Methanobacteriales_O 0.000
0.062 0.0465 0.0773 0.0000000 Chromatiales_O 0.000
0.024 0.0176 0.0306 0.0000000 Rhizobiales_O 0.000
0.022 0.0172 0.0265 0.0000000 Armatimonadales_O 0.000
-0.016 -0.0198 -0.0117 0.0000000 Thermacetogeniales_O 0.000
-0.012 -0.0141 -0.0100 0.0000000 Candidatus Kerfeldbacteria_O 0.000
-0.011 -0.0162 -0.0051 0.0001987 Hypocreales_O 0.007
0.010 0.0087 0.0120 0.0000000 LD1-PA32_O 0.000
0.010 0.0054 0.0150 0.0000326 Sphingomonadales_O 0.001
0.008 0.0059 0.0106 0.0000000 Microtrichales_O 0.000
0.008 0.0054 0.0096 0.0000000 Desulfotomaculales_O 0.000
0.007 0.0036 0.0111 0.0001574 Micromonosporales_O 0.006
-0.007 -0.0093 -0.0046 0.0000000 Chthoniobacterales_O 0.000
-0.006 -0.0092 -0.0033 0.0000447 Pleosporales_O 0.002
0.006 0.0037 0.0082 0.0000004 Hydrogenedentiales_O 0.000
0.005 0.0032 0.0077 0.0000024 Tepidisphaerales_O 0.000
-0.005 -0.0051 -0.0040 0.0000000 Dothideales_O 0.000
0.004 0.0031 0.0055 0.0000000 Syntrophomonadales_O 0.000
0.004 0.0019 0.0054 0.0000631 Rubrobacterales_O 0.002
0.004 0.0014 0.0058 0.0015949 Entomoplasmatales_O 0.048
0.003 0.0013 0.0040 0.0001420 Pyrinomonadales_O 0.005
0.003 0.0019 0.0033 0.0000000 Obscuribacterales_O 0.000
-0.003 -0.0036 -0.0016 0.0000005 Eurycoccales_O 0.000
0.003 0.0022 0.0037 0.0000000 Limnochordales_O 0.000
0.003 0.0019 0.0034 0.0000000 eub62A3_O 0.000
0.003 0.0015 0.0053 0.0003859 Synechococcales_O 0.012
-0.003 -0.0042 -0.0017 0.0000072 RBG-13-54-9_O 0.000
-0.003 -0.0040 -0.0014 0.0000790 Pirellulales_O 0.003
0.002 0.0009 0.0027 0.0000904 Candidatus Abawacabacteria_O 0.003
0.002 0.0010 0.0021 0.0000002 Candidatus Peregrinibacteria_O 0.000
-0.002 -0.0029 -0.0012 0.0000018 Capnodiales_O 0.000
0.002 0.0011 0.0020 0.0000000 Candidatus Terrybacteria_O 0.000
-0.002 -0.0033 -0.0015 0.0000001 Eurotiales_O 0.000
0.002 0.0017 0.0033 0.0000000 Gaiellales_O 0.000
0.002 0.0009 0.0032 0.0003267 Ignavibacteriales_O 0.011
-0.001 -0.0012 -0.0003 0.0005701 Bacteriovoracales_O 0.018
0.001 0.0008 0.0018 0.0000027 S085_O 0.000

P - Phylum (тип)

# Применение функции ко всем таксонам
unique_taxa <- unique(P_long$Taxon)

result_list <- lapply(unique_taxa, function(taxon) {
  analyze_result <- analyze_taxon(taxon, P_long)
  if (!is.null(analyze_result)) {
    return(analyze_result)
  } else {
    return(data.frame())  # Вернуть пустой data.frame, чтобы не влиять на bind_rows
  }
})

# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
  mutate(estimate = round(estimate, 3),
         `2.5 %` = round(`2.5 %`, 4),
         `97.5 %` = round(`97.5 %`, 4)
         )

# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)

# Фильтрация только статистически значимых результатов
significant_results_P <- final_result_df %>%
  filter(p.adjusted < 0.05) %>%  arrange(desc(abs(estimate)))

# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_P, "html") %>%
  kable_styling(full_width = F)
estimate 2.5 % 97.5 % p.value Taxon p.adjusted
-15.536 -22.0641 -9.0082 4.00e-06 Firmicutes_P 0.000
-0.080 -0.1152 -0.0445 1.17e-05 Euryarchaeota_P 0.000
0.038 0.0292 0.0458 0.00e+00 Nitrospirota_P 0.000
0.038 0.0302 0.0467 0.00e+00 Armatimonadota_P 0.000
-0.015 -0.0212 -0.0090 1.50e-06 Basidiomycota_P 0.000
0.006 0.0037 0.0082 4.00e-07 Hydrogenedentes_P 0.000
0.006 0.0032 0.0097 9.66e-05 Myxococcota_P 0.001

Распределения таксонов G(Genus - Род)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups <- function(filter_pattern, step_size, dataset_name) {
  
  # Получаем уникальные таксоны, удовлетворяющие условиям
  unique_taxa <- Wilcox_comparison_round_2 %>%
    subset(grepl(filter_pattern, Taxon)) %>%
    filter(p_value_holm < 0.05) %>%
    distinct(Taxon)
  
  # Разбиваем уникальные таксоны на группы по step_size
  taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
  
  # Проходим по группам и строим графики
  for (i in seq_along(taxon_groups)) {
    current_taxa <- taxon_groups[[i]]
    
    current_dataset <- get(dataset_name)  # Получаем датасет по его имени
    
    G_long_filtered <- current_dataset %>%
      filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
      group_by(Health_state, Taxon) %>%
      summarise(Count = n(), .groups = "drop")
    
    bar_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Count, fill = Health_state == "Disease")) +
      geom_bar(position = "dodge", color = "black", stat = "identity") +
      scale_fill_manual(values = c("skyblue", "red"), guide = FALSE) +
      labs(title = paste("Гистограмма для таксонов", (i - 1) * step_size + 1, "до", min(i * step_size, nrow(unique_taxa)), "в разрезе Health_state"),
           x = "Статус пациента",
           y = "Количество пациентов, у которых обнаружен данный таксон") +
      coord_flip() +
      facet_wrap(~Taxon, scales = "free_y", ncol = 2, shrink = 0.7) +  # Уменьшение пространства между фасетами
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),  # Поворот текста на оси X и уменьшение шрифта
            axis.text.y = element_text(hjust = 1, size = 5),
            strip.text = element_text(size = 5))  # Уменьшение шрифта для названия фасет
    
    print(bar_plot)  # Отображение графика на экране
    
  }
}

# Пример вызова функции с передачей "_G" в качестве аргумента и названия датасета "G_long"
create_and_plot_taxon_groups("_G", 18, "G_long")

график солнышко

create_and_plot_circular_diagrams <- function(dataset, filter_pattern, step_size, filename) {
  
  # Получаем уникальные таксоны, удовлетворяющие условиям
  unique_taxa <- Wilcox_comparison_round_2 %>%
    subset(grepl(filter_pattern, Taxon)) %>%
    filter(p_value_holm < 0.05) %>%
    distinct(Taxon)
  
  # Разбиваем уникальные таксоны на группы по step_size
  taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
  
  # Проходим по группам и строим графики
  for (i in seq_along(taxon_groups)) {
    current_taxa <- taxon_groups[[i]]
    
    G_long_filtered <- dataset %>%
      filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
      group_by(Health_state, Taxon) %>%
      summarise(Count = n(), .groups = "drop")
    
    # Добавляем строки с Count = 0 для всех комбинаций Health_state и Taxon
    G_long_filtered <- G_long_filtered %>%
      complete(Health_state, Taxon, fill = list(Count = 0))
    
    data_id <- G_long_filtered %>% filter(G_long_filtered$Health_state == "Disease") %>%
      mutate(id = row_number()) %>% 
      select(Taxon, id)
    
    G_long_filtered <- left_join(G_long_filtered, data_id, by = "Taxon")
    
    labels_data <- G_long_filtered %>% filter(Health_state == "Disease")
    number_of_bars <- nrow(labels_data)
    
    #Вычислим углы для лейбла каждого барра
    labels_data$angel <- 90 - 360 * (labels_data$id-0.5) / number_of_bars
    
    # Добавим горизонтольную регулировку
    labels_data <- labels_data %>%
      mutate(hjust = ifelse(angel < -90, 1, 0))
    
    # Перевернем лейбл в зависимости от "полушария"
    labels_data <- labels_data %>%
      mutate(angel = ifelse(angel < -90, angel + 180, angel))
    
    # Круговая диаграмма для объединенного датасета
    p <- ggplot(data = G_long_filtered, mapping = aes(x = id, y = Count, fill = Health_state)) + 
      geom_bar(stat = "identity", position = "stack") + 
      ylim(-150, 500) +
      
      # Добавляем кастомную тему
      theme_minimal(base_size = 8) +
      theme(axis.text   = element_blank(),
            axis.title  = element_blank(),
            panel.grid  = element_blank(),
            plot.margin = unit(rep(-1, 6), "cm")) +
      coord_polar(start = 0)+
      geom_text(data = labels_data,
                aes(x = id, y = Count,
                    label = Taxon,
                    hjust = hjust),
                color = "black",
                fontface = "bold",
                alpha = 0.9,
                size = 1.5,
                angle = labels_data$angel,
                inherit.aes = FALSE) +
      # Добавляем горизонтальные линии сетки
      geom_hline(yintercept = seq(0, 300, by = 50),
                 linetype = "dotted",
                 color = "gray",
                 size = 0.5) +
      # Добавляем значения рядом с линиями сетки
      annotate("text", x = 1.2, y = seq(0, 300, by = 50),
               label = seq(0, 300, by = 50),
               color = "black",
               size = 3,
               alpha = 40,
               hjust = 0)
    
    p <- p + annotate("text", x = 0.5, y = 490, label = "Сотношение здоровых пациентов и пациентов с СРК",
                      size = 5, color = "black", fontface = "bold", hjust = 0.5)
    
    # Сформируем имя файла для сохранения
    current_filename <- paste0(filename, "_", i, ".png")
    
    # Сохраняем график под уникальным именем
    ggsave(current_filename, p, width = 10, height = 8)
    
    print(p)
  }
}

# Пример вызова функции
create_and_plot_circular_diagrams(G_long, "_G", 40, "output_plot")

Боксплот по возрасту

create_and_plot_boxplots <- function(G_long, filter_pattern, step_size, x_var, title) {
  
  # Получаем уникальные таксоны, удовлетворяющие условиям
  unique_taxa <- Wilcox_comparison_round_2 %>%
    subset(grepl(filter_pattern, Taxon)) %>%
    filter(p_value_holm < 0.05) %>%
    distinct(Taxon)
  
  # Разбиваем уникальные таксоны на группы по step_size
  taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
  
  # Проходим по группам и строим боксплоты
  for (i in seq_along(taxon_groups)) {
    current_taxa <- taxon_groups[[i]]
    
    G_long_filtered <- G_long %>%
      filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
    
    # Создаем формулу для переменной x_var
    formula <- as.formula(paste("Age ~", x_var))
    
    # Строим боксплот
    box_plot <- ggplot(G_long_filtered, aes_string(x = x_var, y = "Age", fill = "Health_state")) +
      geom_boxplot() +
      labs(title = paste(title),
           x = x_var,
           y = "Age") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),  # Уменьшаем размер шрифта для подписей Taxon
            axis.title.x = element_blank(),  # Убираем название оси X
            legend.position = "bottom") +  # Перемещаем легенду вниз
      scale_fill_manual(values = c("red", "skyblue")) +
      facet_grid(~Taxon, scales = "free_y", space = "free")  # Используем facet_grid вместо facet_wrap
    
    # Отображаем график на экране
    print(box_plot)
  }
}

# Пример вызова функции с Health_state в качестве x_var
create_and_plot_boxplots(G_long, "_G", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")

Боксплот по возросту и полу

create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")

Боксплот по возросту и статусу курения

create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")

Распределения таксонов F Family(семейство)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_F", 18, "F_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(F_long, "_F", 40, "F_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(F_long, "_F", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")

Боксплот по возросту и полу

create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")

Боксплот по возросту и статусу курения

create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")

Распределения таксонов C - Class (класс)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_C", 18, "C_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(C_long, "_C", 40, "F_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(C_long, "_C", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")

Боксплот по возросту и полу

create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")

Боксплот по возросту и статусу курения

create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")

Распределения таксонов O - Order (порядок)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_O", 18, "O_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(O_long, "_O", 30, "F_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(O_long, "_O", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")

Боксплот по возросту и полу

create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")

Боксплот по возросту и статусу курения

create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")

Распределения таксонов P - Phylum (тип)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_P", 18, "P_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(P_long, "_P", 30, "P_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(P_long, "_P", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")

Боксплот по возросту и полу

create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")

Боксплот по возросту и статусу курения

create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")

Посмотрим на данные здоровых людей и людей с СРК как на многомерный вектор по всем таксонам и группам

G - Genus

perform_permutation_test <- function(data, filter_pattern) {
  
  combined_bacteria_G <- data %>%
    select(Health_state, ends_with(filter_pattern))
  
  d <- dist(combined_bacteria_G) # euclidean distances between the rows
  fit <- cmdscale(d, eig=TRUE, k=2) # k is the number of dim
  
  df_mds <- data.frame(
    x = fit$points[,1],
    y = fit$points[,2]
  )  
  
  df_full <- cbind(df_mds, combined_info) %>% 
    mutate(Health_state_n = case_when(Health_state == "Health"  ~ 0,
                                      Health_state == "Disease" ~ 1))
  
  msd_plot <- ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
    geom_point() +
    theme_bw() +
    ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")
  
  print(msd_plot)
  
  print("Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state")
  
  adonis2(d ~ Health_state_n, data = df_full)
}

# Пример вызова функции с использованием другого фильтра
perform_permutation_test(data_wide %>%
  select(Health_state,
    ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_G")

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs     R2      F Pr(>F)   
## Health_state_n   1     1046 0.0122 4.6817  0.006 **
## Residual       379    84659 0.9878                 
## Total          380    85705 1.0000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

F - Family (семейство)

perform_permutation_test(data_wide %>%
  select(Health_state,
    ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_F")

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)   
## Health_state_n   1     1919 0.01423 5.4706  0.002 **
## Residual       379   132940 0.98577                 
## Total          380   134859 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

C - Class (класс)

perform_permutation_test(data_wide %>%
  select(Health_state,
    ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_C")

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)    
## Health_state_n   1    32982 0.12662 54.948  0.001 ***
## Residual       379   227490 0.87338                  
## Total          380   260472 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

O - Order (порядок)

perform_permutation_test(data_wide %>%
  select(Health_state,
    ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_O")

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2     F Pr(>F)    
## Health_state_n   1     6923 0.04767 18.97  0.001 ***
## Residual       379   138309 0.95233                 
## Total          380   145232 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

P - Phylum (тип)

perform_permutation_test(data_wide %>%
  select(Health_state,
    ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_P")

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)    
## Health_state_n   1    32728 0.12502 54.155  0.001 ***
## Residual       379   229042 0.87498                  
## Total          380   261770 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Влияние серотонинпродуцирующих бактерий

library("lme4")
library("stringr")


#Переменная имеет биномиальное распределение
plot(data_long$Серотонин)

#Процент пропущенных значений в переменной Серотонин 96,67%
na_serotonin <- sum(is.na(data_long$Серотонин)) / nrow(data_long) * 100


#Данные о серотонине есть только в F и G taxons
serotonin_taxons <- data_long %>%
  filter(!is.na(`Серотонин`)) %>%
  mutate(Serotonin_taxons = str_sub(Taxon, -2)) %>%
  distinct(Serotonin_taxons) %>%
  pull(Serotonin_taxons)

#При этом только 2 F таксона и 42 G таксона
serotonin_taxons_count_unique <- bac_functions %>%
  filter(!is.na(Серотонин)) %>%
  mutate(Serotonin_taxons = str_sub(Taxon, -2)) %>%
  count(Serotonin_taxons)

#А всего 219 F в датасете
num_unique_F <- data_long %>%
  filter(str_detect(Taxon, "_F$")) %>%
  summarise(Num_Unique = n_distinct(Taxon)) %>%
  pull(Num_Unique)

###Добавляем данные по родам из двух семейств в bac_functions (данные добавлены в 8-й лист Bacterial group functions.xlsx)

# Чтение листов Excel-файла с функциями бактерий и их объединение
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")

neuromediators_1 <- read_xlsx (path, 8) %>% 
  mutate(Destroy = ifelse(is.na (Destroy), "produce", "destroy")) %>% 
  unique() %>%
  pivot_wider(names_from = Neuromediator, values_from = Destroy)

probiotics <- read_xlsx (path, 3) %>% 
  add_column(probiotics = 1)

special_properties <- read_xlsx (path, 4) %>% 
  add_column(special_properties = 1)

vitamins <- read_xlsx (path, 5) %>% 
  pivot_wider (names_from = Vitamin, values_from = Vitamin,
               values_fn = function(x) ifelse(is.na (x), NA, 1))

habbits <- read_xlsx (path, 7) %>%
  unique() %>% #удаление повторяющихся строк 
  pivot_wider(names_from = Habbit, values_from = Habit_state)

bac_functions_1 <- read_xlsx (path, 1) %>% #Патогены и нежелательные
  full_join(neuromediators_1, by = taxon) %>% #Нейромедиаторы
  full_join(probiotics, by = taxon) %>% #Пробиотики
  full_join(special_properties, by = taxon) %>% #С особыми свойствами
  full_join(vitamins, by = taxon) %>%  #Витамины
  full_join(read_xlsx (path, 6), by = taxon) %>% #Продуценты КЦДК
  full_join(habbits, by = taxon) %>% #Вредные привычки
  
  unite("Taxon", TaxonName, Rank, sep = "_") %>% 
  filter (Taxon != "Blautia obeum_S") %>% #для данного таксона противоречивая информация в Продуценты КЦЖК
  mutate_all(as.factor)

rm (path, taxon, neuromediators_1, probiotics, special_properties, vitamins, habbits)


data_long_1 <- data_wide %>% 
  pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
               names_to = "Taxon", values_to = "Percentage")

#Перезапись data_long с добавлением функций бактерий
data_long_1 <- data_long_1 %>% 
  left_join (bac_functions_1, by = "Taxon")


G_long_1 <- data_long_1 %>% subset(grepl("_G", Taxon)) 

#Модель отдельно для таксонов G уровня

G_long_serotonin <- G_long_1 %>%
  filter( ! is.na(`Серотонин`)) %>%
  filter(Percentage > 0.0000)

G_long_serotonin$`Серотонин` <- relevel(G_long_serotonin$`Серотонин`, ref = "destroy")

G_long_serotonin$research_ID <- as.factor(G_long_serotonin$research_ID)
G_long_serotonin$Seq_date <- as.factor(G_long_serotonin$Seq_date)
G_long_serotonin$Seq_date <- as.factor(G_long_serotonin$Seq_date)
G_long_serotonin$patient_ID <- as.factor(G_long_serotonin$patient_ID)



model_ser_G_1 <- glmer(Health_state ~ Серотонин +  (1 | research_ID), data = G_long_serotonin, family = binomial)
summary(model_ser_G_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Health_state ~ Серотонин + (1 | research_ID)
##    Data: G_long_serotonin
## 
##      AIC      BIC   logLik deviance df.resid 
##   8002.6   8027.1  -3998.3   7996.6    26740 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.58164 -0.00200  0.00227  0.00291  0.67902 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  research_ID (Intercept) 482.6    21.97   
## Number of obs: 26743, groups:  research_ID, 7
## 
## Fixed effects:
##                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)       0.30153    2.04143   0.148   0.8826  
## Серотонинproduce  0.14273    0.07277   1.961   0.0498 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Серотонинpr 0.000

Модель не показывает значимой ассоциации наличия серотонинпродуцирующих бактерий и состоянием здоровья человека (СРК/здоровый).

#Boruta for Random Forest, only G-level taxa

data_boruta <- data_wide_batched %>%
  select(ends_with("_G") | "Health_state") # selection of taxa _G in dataset corrected for batch-effect (id reasearch)
library(Boruta)
Boruta(Health_state ~ ., data_boruta, ntree = 1000, maxRuns = 1000) %>% #Boruta 
  TentativeRoughFix() -> boruta_trained

boruta_trained %>%
  attStats() %>%
  rownames_to_column("Переменная") %>%
  mutate(`Переменная` = `Переменная` %>% fct_reorder(`meanImp`)) %>% 
  filter(decision == "Confirmed") %>% 
  select("Переменная") -> boruta_trained_confirmed #dataset with significant variables

Boruta Visualization

boruta_trained %>%
  attStats() %>%
  rownames_to_column("Переменная") %>%
  mutate(`Переменная` = `Переменная` %>% fct_reorder(`meanImp`)) %>% filter(decision == "Confirmed") %>%
  
  ggplot(aes(y = `Переменная`, x = meanImp, colour = decision, size = 4)) +
  geom_point(aes(size = 8)) +
  geom_errorbar(aes(xmin = minImp, xmax = maxImp, width = 3)) +
  xlab("Среднее снижение энтропии") +
  labs(color = "Значимость переменной") +
  theme(legend.position = "bottom", axis.text = element_text(size =70)) 

Random Forest

RF Dataset

data_wide_batched %>%
  select(c(boruta_trained_confirmed$Переменная, Health_state)) -> data_RF

data_RF %>%
  map(function(x) sum(is.na(x)) / length(x)) %>%
  enframe() %>%
  unnest(cols = value) %>%
  arrange(desc(value))
## # A tibble: 179 × 2
##    name                          value
##    <chr>                         <dbl>
##  1 Hassallia_G                       0
##  2 Cetobacterium_G                   0
##  3 Erysipelotrichaceae UCG-009_G     0
##  4 Glutamicibacter_G                 0
##  5 Halobacillus_G                    0
##  6 Mucispirillum_G                   0
##  7 Acetanaerobacterium_G             0
##  8 Dethiosulfatibacter_G             0
##  9 Syntrophomonas_G                  0
## 10 FD2005_G                          0
## # ℹ 169 more rows
data_RF%>%
  select(where(is.factor)) %>%
  map(table)
## $Health_state
## 
## Disease  Health 
##     170     211

RF Test/train data

library(rsample)
data_RF <- data_RF[sample(nrow(data_RF)), ]

split_train_RandFor <- initial_split(data_RF, strata = Health_state, prop = 0.9)

data_RandFor_train <- split_train_RandFor %>% training()
data_RandFor_test <- split_train_RandFor %>% testing() 

RF Test data (only research ID = 4)

data_RandFor_test_alternative <- data_wide %>% #data for test of model, only research 4, becase both value of variable Health_state
  filter(research_ID == 4)
cat_metric <- yardstick::metric_set(
  
    yardstick::bal_accuracy,
    yardstick::precision,
    yardstick::recall,
    yardstick::f_meas,
    yardstick::sensitivity,
    yardstick::specificity,
    yardstick::j_index
  
  )

RF Preparing recipe

library(themis)
library(embed)

data_recipe_RandFor <- recipe(Health_state ~ ., data_RandFor_train) %>%
  
  step_impute_bag(all_numeric_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_lincomb(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors()) 

RF Model

library(parsnip)
rf_model <- rand_forest(mode = "classification", mtry = tune(), trees = tune(), min_n = tune()) %>%
  set_engine("ranger")

RF Samples

cv_samples <- vfold_cv(data_RandFor_train, strata = Health_state, v = 10)

RF Parameters grid

library(dials)

parameters_grid <- grid_max_entropy(mtry(range = c(3, 10)), trees(), min_n(), size = 20)

RF Pipeline

library(workflows)
reg_workflow <- workflow() %>%
  add_recipe(data_recipe_RandFor) %>%
  add_model(rf_model)

RF Education of model

library(tidymodels)
grid_search <- reg_workflow %>%
  
 tune_grid(
    
    object = reg_workflow,
    resamples = cv_samples,
    grid = parameters_grid,
    control = control_grid(save_pred = TRUE),

    metrics = metric_set(sensitivity, specificity, j_index)

  )

RF Visualization

grid_search %>%
  collect_metrics() %>%
  ggplot(aes(trees, mean, color = .metric)) + 
  geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.8) +
  geom_line(size = 1)

RF Best model selection

best_by_j_index <- grid_search %>% select_best("j_index")

final_reg_model <- finalize_workflow(
  
  reg_workflow,
  best_by_j_index
  
)

RF Metrics comparison on train/test data

# test data

final_reg_model %>%
  fit(data_RandFor_train) %>%
  predict(data_RandFor_test_alternative) %>%
  pull() -> final_RandFor_test_prediction

metrics_on_test <- cat_metric(truth = truth_values, estimate = estimate_values, tibble(truth_values = data_RandFor_test_alternative$Health_state, estimate_values = final_RandFor_test_prediction)) %>% rename(test_estimate = `.estimate`) %>% select(!`.estimator`)

# train data

final_reg_model %>%
  fit(data_RandFor_train) %>%
  predict(data_RandFor_train) %>%
  pull() -> final_RandFor_train_prediction

metrics_on_train <- cat_metric(truth = truth_values, estimate = estimate_values, tibble(truth_values = data_RandFor_train$Health_state, estimate_values = final_RandFor_train_prediction)) %>% rename(train_estimate = `.estimate`) %>% select(!`.estimator`)

# binding

metrics_on_test %>%
  left_join(metrics_on_train, by = ".metric") %>%
  mutate(differencies = train_estimate - test_estimate)
## # A tibble: 7 × 4
##   .metric      test_estimate train_estimate differencies
##   <chr>                <dbl>          <dbl>        <dbl>
## 1 bal_accuracy             1              1            0
## 2 precision                1              1            0
## 3 recall                   1              1            0
## 4 f_meas                   1              1            0
## 5 sensitivity              1              1            0
## 6 specificity              1              1            0
## 7 j_index                  1              1            0

RF Model results

last_fit(
  
  final_reg_model,
  split_train_RandFor
  
) -> final_RF_model

final_RF_model %>% write_rds("RandomForest.rds") #saving model

final_RF_model %>%
  extract_workflow() %>%
  predict(data_RandFor_test_alternative, type = "class") %>%
  pull() -> class_prediction

final_RF_model %>%
  extract_workflow() %>%
  predict(data_RandFor_test_alternative, type = "prob") %>%
  pull() -> prob_prediction

RF_model_results <- tibble(truth = data_RandFor_test_alternative$Health_state,
                            estimate = class_prediction,
                            prob_yes = 1 - prob_prediction)

cat_metric(truth = truth, estimate = estimate, RF_model_results) 
## # A tibble: 7 × 3
##   .metric      .estimator .estimate
##   <chr>        <chr>          <dbl>
## 1 bal_accuracy binary             1
## 2 precision    binary             1
## 3 recall       binary             1
## 4 f_meas       binary             1
## 5 sensitivity  binary             1
## 6 specificity  binary             1
## 7 j_index      binary             1
RF_model_results %>%
  roc_curve(truth = truth, prob_yes) %>%
  autoplot()